In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the possum CSV from the Kaggle input directory into a DataFrame.
possum_csv = '/kaggle/input/openintro-possum/possum.csv'
df = pd.read_csv(possum_csv)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Data Cleaning

In [None]:
# Summary of column dtypes and non-null counts.
df.info()

In [None]:
# How many rows fall into each category of the 'Pop' column.
df['Pop'].value_counts()

In [None]:
# How many rows fall into each category of the 'sex' column.
df['sex'].value_counts()

In [None]:
# Convert the two object (string) columns into 0/1 integer indicators:
#   Pop -> 1 where the value is 'other', 0 otherwise
#   sex -> 1 where the value is 'f', 0 otherwise
# A vectorised comparison followed by astype(int) replaces the element-by-element
# list(map(int, ...)) conversion and keeps the result aligned on the DataFrame index.
# NOTE: this cell overwrites its own inputs, so running it a second time without
# reloading df turns both columns into all zeros.
df['Pop'] = (df['Pop'] == 'other').astype(int)
df['sex'] = (df['sex'] == 'f').astype(int)

In [None]:
# Preview the first rows to check the 0/1 encoding of 'Pop' and 'sex'.
df.head()

In [None]:
# The 'case' column is a redundant index, so leave it out of the working frame.
df = df.drop(columns = ['case'])

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Column labels of df at this point.
df.columns

In [None]:
# We'll impute the three rows with missing data. SimpleImputer works column-wise:
# each missing value is replaced by the mean of its own column (not of its row).
# fit_transform returns a bare NumPy array, so the result is wrapped straight back
# into a DataFrame with the original column labels and index. This keeps df a
# DataFrame and avoids depending on a hand-typed column list.
# NOTE(review): the means are computed on the full dataset, before any train/test
# split, so a small amount of test information leaks into the imputed values.
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values = np.nan, strategy='mean')
df = pd.DataFrame(imp_mean.fit_transform(df), columns = df.columns, index = df.index)

In [None]:
# Make sure df is a labelled pandas DataFrame again (the imputer's own output carries
# no column names) and check that no nulls remain.
feature_names = ['site', 'Pop', 'sex', 'age', 'hdlngth', 'skullw', 'totlngth',
                 'taill', 'footlgth', 'earconch', 'eye', 'chest', 'belly']
df = pd.DataFrame(df, columns = feature_names)
df.isnull().sum()

In [None]:
# All looks good! Preview the first rows of the cleaned data.
df.head()

# Exploratory Data Analysis

In [None]:
# What do we have here?
#
# Some interesting pockets of correlation:
#   - foot length is positively correlated with ear conch size
#   - head length (hdlngth), total length and skull width are positively correlated
corr_matrix = df.corr()
plt.figure(figsize = (12, 8))
sns.heatmap(corr_matrix, annot = True, cmap = 'Spectral')

In [None]:
# Ear conch vs. foot length, coloured by sex. Orange is female (sex = 1).
sns.scatterplot(x = 'footlgth', y = 'earconch', data =df, hue = 'sex')

In [None]:
# Skull width vs. head length (hdlngth), coloured by sex. Orange is female (sex = 1).
sns.scatterplot(x = 'skullw', y = 'hdlngth', data =df, hue = 'sex')

In [None]:
# Skull width vs. total length, coloured by sex. Orange is female (sex = 1).
sns.scatterplot(x = 'skullw', y = 'totlngth', data =df, hue = 'sex')

In [None]:
# Head length (hdlngth) vs. total length, coloured by sex. Orange is female (sex = 1).
sns.scatterplot(x = 'hdlngth', y = 'totlngth', data =df, hue = 'sex')

In [None]:
# Distribution of age in the dataset, split by sex. More males present, but a similar
# distribution for both sexes.
sns.histplot(x = 'age', data =df, hue = 'sex')

In [None]:
# Distribution of head length (hdlngth) in the dataset. Evenly distributed between males and females.
sns.histplot(x = 'hdlngth', data =df, hue = 'sex')

In [None]:
# Distribution of skull width in the dataset. Evenly distributed between males and females.
sns.histplot(x = 'skullw', data =df, hue = 'sex')

In [None]:
# Distribution of total length in the dataset. The female population appears to skew toward longer size.
sns.histplot(x = 'totlngth', data =df, hue = 'sex')

In [None]:
# Distribution of tail length in the dataset. Interesting bimodal distribution here.
sns.histplot(x = 'taill', data =df, hue = 'sex')

In [None]:
# Distribution of foot length in the dataset. Sampled males skew towards a smaller foot;
# sampled females skew towards a slightly larger foot.
sns.histplot(x = 'footlgth', data =df, hue = 'sex')

In [None]:
# Distribution of ear conch in the dataset. Another bimodal distribution in both sexes.
sns.histplot(x = 'earconch', data =df, hue = 'sex')

In [None]:
# Distribution of eye in the dataset. Normal distribution.
sns.histplot(x = 'eye', data =df, hue = 'sex')

In [None]:
# Distribution of chest in the dataset. Somewhat normal distribution.
sns.histplot(x = 'chest', data =df, hue = 'sex')

In [None]:
# Distribution of belly in the dataset. Somewhat normal distribution.
sns.histplot(x = 'belly', data =df, hue = 'sex')

# Classification of Sex

In [None]:
# Given how similar the sexes are in all measured features, can we accurately classify male and female possums?

# This will be our first classification task with this dataset

# Data and target feature

In [None]:
# Target: the 0/1 'sex' column. Features: every other column.
y = df['sex']
X = df.drop(columns = ['sex'])

# train test split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 101,
)

In [None]:
# We'll train several classifiers: Logistic Regression, Decision Tree Classifier,
# Random Forest Classifier and XGBoost Classifier

# Logistic Regression

from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be chained.
lr = LogisticRegression(solver = 'newton-cg').fit(X_train, y_train)
lr_pred = lr.predict(X_test)

In [None]:
# Logistic regression struggles here

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Classification report, two blank lines, then the confusion matrix.
lr_report = classification_report(y_test, lr_pred)
lr_cm = confusion_matrix(y_test, lr_pred)
print(lr_report, lr_cm, sep = '\n\n\n')

# Overall accuracy, kept for the model comparison table.
lr_acc = accuracy_score(y_test, lr_pred)

In [None]:
# Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier

# Seed the tree so that Restart & Run All reproduces the same model and predictions;
# without random_state the fitted tree can differ from run to run.
# 101 is the same seed used for train_test_split.
dtc = DecisionTreeClassifier(random_state = 101)
dtc.fit(X_train,y_train)
dtc_pred = dtc.predict(X_test)

In [None]:
# Decision tree classification struggles even more.

# Classification report, two blank lines, then the confusion matrix.
dtc_report = classification_report(y_test, dtc_pred)
dtc_cm = confusion_matrix(y_test, dtc_pred)
print(dtc_report, dtc_cm, sep = '\n\n\n')

# Overall accuracy, kept for the model comparison table.
dtc_acc = accuracy_score(y_test, dtc_pred)

In [None]:
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# Seed the forest so that Restart & Run All reproduces the same model and predictions;
# bootstrap sampling and feature selection are random without random_state.
# 101 is the same seed used for train_test_split.
rfc = RandomForestClassifier(random_state = 101)
rfc.fit(X_train,y_train)
rfc_pred = rfc.predict(X_test)

In [None]:
# Random forest roughly matches logistic regression in performance

# Classification report, two blank lines, then the confusion matrix.
rfc_report = classification_report(y_test, rfc_pred)
rfc_cm = confusion_matrix(y_test, rfc_pred)
print(rfc_report, rfc_cm, sep = '\n\n\n')

# Overall accuracy, kept for the model comparison table.
rfc_acc = accuracy_score(y_test, rfc_pred)

In [None]:
# XGBoost Classifier

from xgboost import XGBClassifier

# NOTE(review): use_label_encoder appears to be deprecated in newer XGBoost
# releases - confirm against the installed version before relying on it.
xgc = XGBClassifier(
    use_label_encoder = False,
)
xgc.fit(X_train, y_train)
xgc_pred = xgc.predict(X_test)

In [None]:
# XGBoost classifier results

# Classification report, two blank lines, then the confusion matrix.
xgc_report = classification_report(y_test, xgc_pred)
xgc_cm = confusion_matrix(y_test, xgc_pred)
print(xgc_report, xgc_cm, sep = '\n\n\n')

# Overall accuracy, kept for the model comparison table.
xgc_acc = accuracy_score(y_test, xgc_pred)

In [None]:
# K-Nearest Neighbors Classifier (k = 2)

from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and training can be chained.
knc = KNeighborsClassifier(n_neighbors = 2).fit(X_train, y_train)
knc_pred = knc.predict(X_test)

In [None]:
# K-nearest-neighbours results: classification report, two blank lines, confusion matrix.
knc_report = classification_report(y_test, knc_pred)
knc_cm = confusion_matrix(y_test, knc_pred)
print(knc_report, knc_cm, sep = '\n\n\n')

# Overall accuracy, kept for the model comparison table.
knc_acc = accuracy_score(y_test, knc_pred)

In [None]:
# Support Vector Classifier with a linear kernel

from sklearn.svm import SVC

# fit() returns the estimator itself, so construction and training can be chained.
svc = SVC(kernel = 'linear').fit(X_train, y_train)
svc_pred = svc.predict(X_test)

In [None]:
# Linear SVC results: classification report, two blank lines, confusion matrix.
svc_report = classification_report(y_test, svc_pred)
svc_cm = confusion_matrix(y_test, svc_pred)
print(svc_report, svc_cm, sep = '\n\n\n')

# Overall accuracy, kept for the model comparison table.
svc_acc = accuracy_score(y_test, svc_pred)

In [None]:
# Gaussian Naive Bayes

from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training can be chained.
gnb = GaussianNB().fit(X_train, y_train)
gnb_pred = gnb.predict(X_test)

In [None]:
# Gaussian Naive Bayes results: classification report, two blank lines, confusion matrix.
gnb_report = classification_report(y_test, gnb_pred)
gnb_cm = confusion_matrix(y_test, gnb_pred)
print(gnb_report, gnb_cm, sep = '\n\n\n')

# Overall accuracy, kept for the model comparison table.
gnb_acc = accuracy_score(y_test, gnb_pred)

# Sex classification results

In [None]:
# The poor success here is due to the similarity between male and female possums.
# One row per model, pairing its name with the test-set accuracy computed above.
model_names = [
    'Logistic Regression',
    'Decision Tree Classifier',
    'Random Forest Classifier',
    'XGBoost Classifier',
    'KNeighbors Classifier',
    'SVC',
    'GaussianNB',
]
model_accuracies = [lr_acc, dtc_acc, rfc_acc, xgc_acc, knc_acc, svc_acc, gnb_acc]
results = pd.DataFrame({'Model': model_names, 'Accuracy': model_accuracies})
results

# KNeighbors wins!

# Optimizing KNeighbors

In [None]:
# Test-set accuracy of k-nearest-neighbours for every k from 1 to 40.
# Scores are collected in a plain list and converted to an array once at the end;
# calling np.append inside the loop re-allocates and copies the whole array on
# every iteration.
# NOTE(review): k is being selected on the test set, so the best score found here is
# optimistic; a validation split or cross-validation would give a fairer choice.
knc_i_accs = []

for i in range(1,41):
    knc_i = KNeighborsClassifier(i)
    knc_i.fit(X_train,y_train)
    knc_i_pred = knc_i.predict(X_test)
    knc_i_acc = accuracy_score(y_test, knc_i_pred)
    knc_i_accs.append(knc_i_acc)

# Same end result as before: a 1-D float array with one accuracy per k.
knc_i_accs = np.array(knc_i_accs)


In [None]:
# k = 2 yields the highest accuracy. Such a low k is suspect and not likely to transfer
# well to other data points.

# Derive the k axis from the number of recorded accuracies instead of repeating the
# literal sweep range, so the plot cannot fall out of step with the loop that filled
# knc_i_accs. The sweep starts at k = 1, hence the 1-based range.
k_values = np.arange(1, len(knc_i_accs) + 1)
plt.plot(k_values, knc_i_accs, marker = 'o', linestyle = '--')
plt.xlabel('k')
plt.ylabel('accuracy')