# Library Loading

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
from scipy.stats import randint
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, mean_squared_error
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import os
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.neighbors import LocalOutlierFactor
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import linear_model
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import LabelEncoder
import random

# Seed NumPy's and Python's global random number generators so that any code
# drawing from them produces the same numbers on every run
np.random.seed(0)
random.seed(0)

# Remove pandas' display limits: print every column and every row
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Feature selection (with RF)
Select the most informative features based on the importances assigned by a random forest classifier.

##  Gender


In [None]:
# Load the gender dataset from the working directory
df = pd.read_csv('final_data_gender.csv')

# Preview the first rows to check which columns were read
df.head()

In [None]:
# Turn gender into a numeric dummy: male = 1, every other value = 0.
# dtype=int is requested explicitly because get_dummies returns booleans by
# default in pandas >= 2.0, which would store True/False instead of 1/0 in
# this column (and in the CSV exported from it).
df['gender'] = pd.get_dummies(df['gender'], dtype=int)['male']

In [None]:
# Target: the gender column
y = df['gender']

# Predictors: every column except the six listed here
X = df.drop(
    columns=['gender', 'age_group', 'age_order', 'panelist_id',
             'social_status', 'social_status_order'])

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the shuffled
# split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [None]:
# Random forest used to rank the features: 2000 gini trees with at least
# 20 samples per leaf and a fixed seed
rf = RandomForestClassifier(criterion='gini',
                            n_estimators=2000,
                            min_samples_leaf=20,
                            random_state=0)

# Train on the training split only
rf.fit(X_train, y_train)

In [None]:
# One importance value per predictor, in the column order of X_train
importances = rf.feature_importances_

# Pair every importance with its column name and rank the rows from the most
# to the least important feature
df2 = pd.DataFrame(
    {'feature': X_train.columns, 'importance': importances}
).sort_values('importance', ascending=False)

# Bar chart of the full ranking
df2.plot(kind='bar', x='feature', figsize=(20, 8))
plt.show()

# Table of the 31 highest-ranked features
df2.head(31)

In [None]:
# Keep the panelist id, the 31 top-ranked features and the gender label
df_export_gender = df[['panelist_id', *df2['feature'].iloc[:31], 'gender']]
df_export_gender.head()

In [None]:
# Write the reduced gender dataset to disk without the row index; it is read
# back later as the "selected features" gender dataset
df_export_gender.to_csv('gender_selected.csv', index = False)

## Age

In [None]:
# Load the age dataset from the working directory
df = pd.read_csv('final_data_age.csv')

# Turning gender into dummy: male = 1
# NOTE: 'gender' is dropped from the predictors in the next cell and is not
# the target of this section, so this encoding does not affect the age model.
df['gender'] = pd.get_dummies(df['gender'])['male']
# Preview the first rows
df.head()

In [None]:
# Target: the age_order column
y = df['age_order']

# Predictors: every column except the six listed here
X = df.drop(
    columns=['age_order', 'age_group', 'gender', 'panelist_id',
             'social_status', 'social_status_order'])

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the shuffled
# split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [None]:
# Recode the y labels as integers.
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both splits share one label -> integer mapping. Refitting on
# the test split (as fit_transform would) can silently assign different
# integers to the same class whenever a class is missing from one split;
# transform() instead raises a ValueError for labels unseen during fitting.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [None]:
# Random forest used to rank the features: 2000 gini trees with at least
# 20 samples per leaf and a fixed seed
rf = RandomForestClassifier(criterion='gini',
                            n_estimators=2000,
                            min_samples_leaf=20,
                            random_state=0)

# Train on the training split only
rf.fit(X_train, y_train)

In [None]:
# One importance value per predictor, in the column order of X_train
importances = rf.feature_importances_

# Pair every importance with its column name and rank the rows from the most
# to the least important feature
df2 = pd.DataFrame(
    {'feature': X_train.columns, 'importance': importances}
).sort_values('importance', ascending=False)

# Bar chart of the full ranking
df2.plot(kind='bar', x='feature', figsize=(20, 8))
plt.show()

# Table of the 31 highest-ranked features
df2.head(31)

In [None]:
# Keep the panelist id, the 31 top-ranked features and the age_order label
df_export_age = df[['panelist_id', *df2['feature'].iloc[:31], 'age_order']]
df_export_age.head()

In [None]:
# Write the reduced age dataset to disk without the row index; it is read
# back later as the "selected features" age dataset
df_export_age.to_csv('age_selected.csv',index=False)

# Load the data

## Gender

### Whole dataset

In [None]:
# Load the full gender dataset (all columns) from the working directory
df_gender_whole = pd.read_csv('final_data_gender.csv')

In [None]:
# Turn gender into a numeric dummy: male = 1, every other value = 0.
# dtype=int is requested explicitly because get_dummies returns booleans by
# default in pandas >= 2.0, which would make the target True/False instead of
# the 1/0 coding described here.
df_gender_whole['gender'] = pd.get_dummies(df_gender_whole['gender'], dtype=int)['male']

In [None]:
# Target: the gender column
y_gender_whole = df_gender_whole['gender']

# Predictors: every column except the six listed here
X_gender_whole = df_gender_whole.drop(
    columns=['gender', 'age_group', 'age_order', 'panelist_id',
             'social_status', 'social_status_order'])

### Selected features

In [None]:
# Load the reduced gender dataset written by the feature-selection step
df_gender_selected = pd.read_csv('gender_selected.csv')
# Preview the first rows
df_gender_selected.head()

In [None]:
# Target: the gender column
y_gender_selected = df_gender_selected['gender']

# Predictors: every column except the target and the panelist id
X_gender_selected = df_gender_selected.drop(columns=['gender', 'panelist_id'])

# Peek at both objects.
# NOTE: a notebook normally renders only the last expression of a cell, so
# the first head() call is presumably not shown.
X_gender_selected.head()
y_gender_selected.head()

## Age

### Whole dataset

In [None]:
# Load the full age dataset (all columns) from the working directory
df_age_whole = pd.read_csv('final_data_age.csv')

In [None]:
# Turning gender into dummy: male = 1
# NOTE: 'gender' is removed from the predictors in the next cell and the
# target there is 'age_order', so this column does not feed the age models
# built from this dataframe.
df_age_whole['gender'] = pd.get_dummies(df_age_whole['gender'])['male']

In [None]:
# Target: the age_order column
y_age_whole = df_age_whole['age_order']

# Predictors: every column except the six listed here
X_age_whole = df_age_whole.drop(
    columns=['gender', 'age_group', 'age_order', 'panelist_id',
             'social_status', 'social_status_order'])

### Selected features

In [None]:
# Load the reduced age dataset written by the feature-selection step
df_age_selected = pd.read_csv('age_selected.csv')

In [None]:
# Target: the age_order column
y_age_selected = df_age_selected['age_order']

# Predictors: every column except the target and the panelist id
X_age_selected = df_age_selected.drop(columns=['age_order', 'panelist_id'])

# Model Functions

## Random Forest (note: the `RF` function below actually fits an XGBoost classifier)

In [None]:
def RF(X_train, X_test, y_train, y_test):
    """Tune an XGBoost classifier by randomized search and report test metrics.

    Despite its name, this helper fits ``xgb.XGBClassifier`` (gradient-boosted
    trees), not a random forest. Five hyper-parameter combinations are drawn
    from the candidate values below and each is scored with 5-fold
    cross-validation on the training data. The best estimator found by the
    search is then evaluated on the test data: test accuracy, confusion matrix
    and classification report are printed.

    Parameters
    ----------
    X_train, y_train : training features and labels used for the search.
    X_test, y_test : held-out features and labels used only for the report.

    Returns
    -------
    The best estimator selected by the randomized search.

    NOTE(review): the objective is hard-coded to 'binary:logistic' although
    this helper is also called with the age labels; confirm the installed
    xgboost version handles a possible multi-class target as intended.
    """
    # Base learner: 100 boosting rounds, alpha=10 and a fixed seed
    booster = xgb.XGBClassifier(objective='binary:logistic',
                                n_estimators=100,
                                alpha=10,
                                random_state=0)

    # Candidate values the randomized search samples from
    search_space = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5],
        'subsample': [0.3, 0.6, 0.8, 1.0],
        'colsample_bytree': [0.3, 0.6, 0.8, 1.0],
        'max_depth': [5],
        'learning_rate': [0.03, 0.05, 0.1],
    }

    # 5 sampled configurations x 5-fold CV, seeded for reproducibility
    search = RandomizedSearchCV(booster,
                                search_space,
                                n_iter=5,
                                cv=5,
                                random_state=0)
    search.fit(X_train, y_train)
    tuned_model = search.best_estimator_

    # Evaluate the winning configuration on the held-out data
    predictions = tuned_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)

    print("Test Accuracy:", round((test_accuracy * 100), 4), '%')
    print(confusion_matrix(y_test, predictions))
    print(metrics.classification_report(y_test, predictions))

    return tuned_model

## Naive Bayes 

In [None]:
def NB(X_train, X_test, y_train, y_test):
    """Fit a Gaussian naive Bayes classifier and report test metrics.

    The model is trained with default settings on the training data and then
    evaluated on the test data: test accuracy, confusion matrix and
    classification report are printed.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels used only for the report.

    Returns
    -------
    The fitted ``GaussianNB`` instance.
    """
    # fit() returns the estimator itself, so construction and training chain
    model = GaussianNB().fit(X_train, y_train)

    # Evaluate on the held-out data
    predictions = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)

    print("Test Accuracy:", round((test_accuracy * 100), 4), '%')
    print(confusion_matrix(y_test, predictions))
    print(metrics.classification_report(y_test, predictions))

    return model

## KNN

In [None]:
def KNN(X_train, X_test, y_train, y_test):
    """Tune a k-nearest-neighbours classifier and report test metrics.

    Training rows flagged as outliers by a Local Outlier Factor detector
    (10 neighbours, contamination 0.1) are removed first. The number of
    neighbours k is then chosen from 1..14 by an exhaustive grid search with
    5-fold cross-validation on the remaining rows. The best estimator is
    evaluated on the untouched test data: test accuracy, confusion matrix and
    classification report are printed.

    Parameters
    ----------
    X_train, y_train : training features and labels; both are indexed with a
        boolean mask, so they must support that kind of indexing.
    X_test, y_test : held-out features and labels used only for the report.

    Returns
    -------
    The best ``KNeighborsClassifier`` selected by the grid search.
    """
    # Flag outliers in the training data; fit_predict labels them with -1
    detector = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
    keep = detector.fit_predict(X_train) != -1
    X_inliers, y_inliers = X_train[keep], y_train[keep]

    # Grid search over k on the outlier-free training rows
    k_search = GridSearchCV(KNeighborsClassifier(),
                            {'n_neighbors': range(1, 15)},
                            cv=5)
    k_search.fit(X_inliers, y_inliers)
    tuned_model = k_search.best_estimator_

    # Evaluate on the held-out data (no outlier removal is applied here)
    predictions = tuned_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)

    print("Test Accuracy:", round((test_accuracy * 100), 4), '%')
    print(confusion_matrix(y_test, predictions))
    print(metrics.classification_report(y_test, predictions))

    return tuned_model

## GLM

In [None]:
def GLM(X_train, X_test, y_train, y_test):
    """Fit a logistic regression with default settings and report test metrics.

    The model is trained on the training data and evaluated on the test data:
    test accuracy, confusion matrix and classification report are printed.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels used only for the report.

    Returns
    -------
    The fitted ``LogisticRegression`` instance.
    """
    # fit() returns the estimator itself, so construction and training chain
    model = LogisticRegression().fit(X_train, y_train)

    # Evaluate on the held-out data
    predictions = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)

    print('Test Accuracy:', round((test_accuracy * 100), 4), '%')
    print(confusion_matrix(y_test, predictions))
    print(metrics.classification_report(y_test, predictions))

    return model

## Neural Network

In [None]:
def nn(X_train, X_test, y_train, y_test):
    """Tune a small multi-layer perceptron and report test metrics.

    An ``MLPClassifier`` (ReLU activation, SGD solver, random_state=1) is
    tuned with an exhaustive grid search over four candidate architectures,
    each with four hidden layers of equal width n for n = 2..5, using 5-fold
    cross-validation on the training data. The best estimator is evaluated on
    the test data: test accuracy, confusion matrix and classification report
    are printed.

    Parameters
    ----------
    X_train, y_train : training features and labels used for the search.
    X_test, y_test : held-out features and labels used only for the report.

    Returns
    -------
    The best ``MLPClassifier`` selected by the grid search.
    """
    # Candidate architectures: (2, 2, 2, 2), (3, 3, 3, 3), ..., (5, 5, 5, 5)
    param_nnet = {"hidden_layer_sizes": [(n, n, n, n) for n in range(2, 6)]}

    # Exhaustive (not randomized) search over the candidates with 5-fold CV
    grid_search_nnet = GridSearchCV(
        MLPClassifier(activation='relu', solver='sgd', random_state=1),
        param_grid=param_nnet,
        cv=5)

    grid_search_nnet.fit(X_train, y_train)

    # Best estimator found by the search
    best_nnet = grid_search_nnet.best_estimator_

    # Predict on the test set
    y_pred = best_nnet.predict(X_test)

    # Accuracy score
    best_nnet_score = accuracy_score(y_test, y_pred)
    print("Test Accuracy:", round((best_nnet_score * 100), 4), '%')

    # Confusion matrix
    print(confusion_matrix(y_test, y_pred))

    # Classification report
    print(metrics.classification_report(y_test, y_pred))

    return best_nnet

## Stacking

In [None]:
def stack(X_train, X_test, y_train, y_test, models):
    """Fit a stacked ensemble on top of ``models`` and report test metrics.

    The base estimators are combined by a ``StackingClassifier`` whose final
    estimator is a logistic regression (random_state=0) and which uses 5-fold
    cross-validation internally. The ensemble is fitted on the training data
    and evaluated on the test data: test accuracy, confusion matrix and
    classification report are printed.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels used only for the report.
    models : list of ``(name, estimator)`` tuples passed as the ensemble's
        base estimators. NOTE: with an integer ``cv`` scikit-learn fits its
        own copies of these estimators on ``X_train``; any earlier fit of the
        objects passed in is not reused.

    Returns
    -------
    The fitted ``StackingClassifier``. It is returned so this helper behaves
    like the other model helpers in this file; callers that ignore the return
    value are unaffected (in a notebook, a bare ``stack(...)`` call at the end
    of a cell will additionally display the estimator).
    """
    # Meta-learner that combines the base estimators' predictions
    lr = LogisticRegression(random_state=0)

    sclf = StackingClassifier(estimators=models,
                              final_estimator=lr,
                              cv=5)

    sclf.fit(X_train, y_train)

    # Predict on the test set
    y_pred = sclf.predict(X_test)

    # Accuracy score
    sclf_score = accuracy_score(y_test, y_pred)
    print("Test Accuracy:", round((sclf_score * 100), 4), '%')

    # Confusion matrix
    print(confusion_matrix(y_test, y_pred))

    # Classification report
    print(metrics.classification_report(y_test, y_pred))

    return sclf

# Gender Prediction

## [Whole dataset]

### Train-test split

In [None]:
# Hold out 20% of the full gender dataset for testing (shuffled, seed 0)
X_train, X_test, y_train, y_test = train_test_split(
    X_gender_whole,
    y_gender_whole,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

### Normalisation

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks in
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Models

In [None]:
%%time
# Tuned XGBoost classifier (the helper is named RF); prints its test metrics
best_xgb_clf = RF(X_train, X_test, y_train, y_test)

In [None]:
%%time
# Gaussian naive Bayes
gnb = NB(X_train, X_test, y_train, y_test)

In [None]:
%%time
# k-nearest neighbours, tuned on the outlier-filtered training rows
best_knn = KNN(X_train, X_test, y_train, y_test)

In [None]:
%%time
# Logistic regression
logreg = GLM(X_train, X_test, y_train, y_test)

In [None]:
%%time
# Multi-layer perceptron chosen by grid search
best_nnet = nn(X_train, X_test, y_train, y_test)

In [None]:
%%time
# Base learners for the stacked ensemble, each paired with a short name
models = [('xgb', best_xgb_clf),
          ('gnb', gnb),
          ('knn', best_knn),
          ('lr', logreg),
          ('nn', best_nnet)]

# Fit the stacking classifier on the training split and print its test metrics
stack(X_train, X_test, y_train, y_test, models)

## [Selected features]

### Train-test split

In [None]:
# Hold out 20% of the reduced gender dataset for testing (shuffled, seed 0)
X_train, X_test, y_train, y_test = train_test_split(
    X_gender_selected,
    y_gender_selected,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

### Normalisation

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks in
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### Models

In [None]:
# Tuned XGBoost classifier (the helper is named RF); prints its test metrics
best_xgb_clf = RF(X_train, X_test, y_train, y_test)

In [None]:
# Gaussian naive Bayes
gnb = NB(X_train, X_test, y_train, y_test)

In [None]:
# k-nearest neighbours, tuned on the outlier-filtered training rows
best_knn = KNN(X_train, X_test, y_train, y_test)

In [None]:
# Logistic regression
logreg = GLM(X_train, X_test, y_train, y_test)

In [None]:
# Multi-layer perceptron chosen by grid search
best_nnet = nn(X_train, X_test, y_train, y_test)

In [None]:
# Base learners for the stacked ensemble, each paired with a short name
models = [('xgb', best_xgb_clf),
          ('gnb', gnb),
          ('knn', best_knn),
          ('lr', logreg),
          ('nn', best_nnet)]

# Fit the stacking classifier on the training split and print its test metrics
stack(X_train, X_test, y_train, y_test, models)

# Age Prediction

## [Whole dataset]

### Train-test split

In [None]:
# Hold out 20% of the full age dataset for testing (shuffled, seed 0)
X_train, X_test, y_train, y_test = train_test_split(
    X_age_whole,
    y_age_whole,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

### Normalisation

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks in
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Encode the age labels as integers.
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both splits share one label -> integer mapping. Refitting on
# the test split (as fit_transform would) can silently assign different
# integers to the same class whenever a class is missing from one split, which
# would corrupt every test metric; transform() instead raises a ValueError for
# labels unseen during fitting.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

### Models

In [None]:
# Tuned XGBoost classifier (the helper is named RF); prints its test metrics
best_xgb_clf = RF(X_train, X_test, y_train, y_test)

In [None]:
# Gaussian naive Bayes
gnb = NB(X_train, X_test, y_train, y_test)

In [None]:
# k-nearest neighbours, tuned on the outlier-filtered training rows
best_knn = KNN(X_train, X_test, y_train, y_test)

In [None]:
# Logistic regression
logreg = GLM(X_train, X_test, y_train, y_test)

In [None]:
# Multi-layer perceptron chosen by grid search
best_nnet = nn(X_train, X_test, y_train, y_test)

In [None]:
# Base learners for the stacked ensemble, each paired with a short name
models = [('xgb', best_xgb_clf),
          ('gnb', gnb),
          ('knn', best_knn),
          ('lr', logreg),
          ('nn', best_nnet)]

# Fit the stacking classifier on the training split and print its test metrics
stack(X_train, X_test, y_train, y_test, models)

## [Selected features]

### Train-test split

In [None]:
# Hold out 20% of the reduced age dataset for testing (shuffled, seed 0)
X_train, X_test, y_train, y_test = train_test_split(
    X_age_selected,
    y_age_selected,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

### Normalisation

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks in
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Encode the age labels as integers.
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both splits share one label -> integer mapping. Refitting on
# the test split (as fit_transform would) can silently assign different
# integers to the same class whenever a class is missing from one split, which
# would corrupt every test metric; transform() instead raises a ValueError for
# labels unseen during fitting.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

### Models

In [None]:
# Tuned XGBoost classifier (the helper is named RF); prints its test metrics
best_xgb_clf = RF(X_train, X_test, y_train, y_test)

In [None]:
# Gaussian naive Bayes
gnb = NB(X_train, X_test, y_train, y_test)

In [None]:
# k-nearest neighbours, tuned on the outlier-filtered training rows
best_knn = KNN(X_train, X_test, y_train, y_test)

In [None]:
# Logistic regression
logreg = GLM(X_train, X_test, y_train, y_test)

In [None]:
# Multi-layer perceptron chosen by grid search
best_nnet = nn(X_train, X_test, y_train, y_test)

In [None]:
# Base learners for the stacked ensemble, each paired with a short name
models = [('xgb', best_xgb_clf),
          ('gnb', gnb),
          ('knn', best_knn),
          ('lr', logreg),
          ('nn', best_nnet)]

# Fit the stacking classifier on the training split and print its test metrics
stack(X_train, X_test, y_train, y_test, models)