#### Setup

In [None]:
import pandas as pd
# import numpy as np

# from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import plot_roc_curve

# from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from collections import Counter

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [None]:
# Location of the raw survey responses (CSV hosted on GitHub)
url = 'https://raw.githubusercontent.com/sai-byui/spring22/main/week02/AI%20Society%20Survey.csv'

# Load the survey, discard the submission timestamp, and keep only
# responses that answered every question (rows containing NaN are removed).
data = (
    pd.read_csv(url)
    .drop(columns=['Timestamp'])
    .dropna()
)

#### Clean Up

In [None]:
# Replace the survey's column headers with short, code-friendly names.
data.columns = ["height", "age", "gender", "shoe_size", "dress", "twilight", "horse_duck", "office"]

# One-hot encode the categorical answers. Passing `columns=` makes pandas
# prefix every indicator column with the question it came from
# (e.g. "dress_<answer>"), so identical answers to different questions can
# never collide into duplicate column names, and the indicator columns stay
# identifiable in the correlation heatmap and feature-importance plot.
# The encoded source columns are dropped automatically and the indicators are
# appended after the remaining columns, in the order listed here.
categorical_columns = ['dress', 'twilight', 'horse_duck', 'office']
data = pd.get_dummies(data, columns=categorical_columns)

# Gender is the prediction target, so it is label-encoded into integer
# class ids rather than one-hot encoded.
le = LabelEncoder()
data['gender'] = le.fit_transform(data['gender'])

In [None]:
# Keep only rows with a realistic height: anything above 100 is treated as a
# data-entry error (the row at index 18 reported 510). Every remaining column
# is then cast to float.
realistic_height = data['height'] <= 100
data = data[realistic_height].astype('float')

# Engineer an extra feature: the product of shoe size and height.
data['shoe_height'] = data['shoe_size'] * data['height']

In [None]:
# Visualise the pairwise correlations between all columns as a heatmap,
# then preview the first ten rows of the cleaned table.
import seaborn as sns

correlations = data.corr()
sns.heatmap(correlations)
data.head(10)

#### Split

In [None]:
# Target is the encoded gender; every other column is a predictor.
y = data['gender']
X = data.drop(columns=['gender'])

# Two successive 80/20 splits with the same seed: the first carves off the
# dev set, the second carves the test set out of what is left. X / y end up
# holding the final training portion.
split_options = dict(random_state=42, train_size=.8)
X, X_dev, y, y_dev = train_test_split(X, y, **split_options)
X, X_test, y, y_test = train_test_split(X, y, **split_options)

# Report the sizes of the train / dev / test partitions.
print(len(X), len(X_dev), len(X_test))

In [None]:
# Class balance check: number of rows per encoded gender label.
data.gender.value_counts()

#### Model

In [None]:
# Train an XGBoost classifier (default hyperparameters) on the training split.
model = XGBClassifier()
model.fit(X, y)

# Predict labels for the dev split (y_hat) and the held-out test split
# (y_holdout), in that order.
y_hat, y_holdout = (model.predict(features) for features in (X_dev, X_test))

In [None]:
# Evaluate on the dev split: draw the ROC curve, then print the per-class
# precision / recall / F1 report for the dev predictions.
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed
# in 1.2; on newer versions RocCurveDisplay.from_estimator is the replacement
# (the import at the top of the notebook would need to change as well).
plot_roc_curve(model, X_dev, y_dev)
dev_report = classification_report(y_dev, y_hat)
print(dev_report)

#### Result

In [None]:
import seaborn as sns

# Pair each importance score from the fitted model with its feature name,
# and order the table from most to least important.
importance = model.feature_importances_
score_name_pairs = zip(importance, X.columns.to_list())
df = pd.DataFrame(
    list(score_name_pairs),
    columns=['importance', 'feature'],
).sort_values(by='importance', ascending=False)

# Horizontal bar chart of the ranked importances on an 11.7 x 8.27 figure.
# The trailing semicolon keeps the notebook from echoing the Axes repr.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
sns.barplot(data=df, x='importance', y='feature');