# Import all packages

In [103]:
import pandas as pd
import random
import numpy as np
from datetime import datetime as dt
from multiprocessing import Pool
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, Imputer
from sklearn.metrics import mean_squared_error as mse, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


# Import data and structure it
### Write wanted sample, set 0 if all data wanted:


In [76]:
sample_dataset = 10000

In [77]:
#IMPORTANT : THIS IS IN EXCEL FORMAT, CHANGE TO CSV IF YOUR FILE IS NOT XLSX
df_q_list = pd.read_excel("question_data.xlsx")

print("Making sample data")
# TAKE A SAMPLE OF DATA
filename = "user_data_public.csv"

if sample_dataset != 0:
    n = sum(1 for line in open(filename)) - 1 #number of records in file (excludes header)
    s = sample_dataset #desired sample size
    skip = sorted(random.sample(range(1,n+1),n-s)) #the 0-indexed header will not be included in the skip list
    df_master = pd.read_csv(filename, 
                            skiprows=skip, 
                            dtype = object)
else:
    df_master = pd.read_csv(filename, skiprows=skip, dtype = object)
print("Finished making sample data")

# Take only questions with n answers and put the questions in a column
#df_q.N.plot(kind="hist")
n_answers = 20000
df_keep = pd.DataFrame()
df_keep['keep_questions'] = df_q_list[df_q_list["N"]>n_answers].index

#Keep only questions that are also in master dataframe and make new dataframe with most answered questions
df_keep = df_keep[df_keep["keep_questions"].isin(list(df_master.columns.values))]
df_master_keep = df_master[list(df_keep["keep_questions"])]

#Count not missing each row i.e: number of answered questions by each preson:
df_master_keep["answered_questions"] = df_master_keep.notnull().sum(axis=1).copy()

#Choose only rows with at least n answered questions
n_questions = 400
df_clean_v1 = df_master_keep[df_master_keep["answered_questions"]>n_questions]

#Create dummy for sexual orientation, where 1 = NOT STRAIGHT
print("Now creating Dummy of sexual orientation")
sexual_orientation = []
for x in df_clean_v1['d_orientation']:
    if pd.isnull(x):
        sexual_orientation.append(1) #Since we have persons answered over a 1000 questions, so if they have not answered their sexual orientation we assume they are either uncertain or dont want to disclose because of fear of persecution/judgement from peers
    elif x != "Straight":
        sexual_orientation.append(1)
    else:
        sexual_orientation.append(0)
        
#Turn it into a dataframe and put labels from dataframe on
y = pd.DataFrame(sexual_orientation, index = df_clean_v1.index)

# Drop sexual orientation from master dataframe
df_clean_v1.drop(columns = ["d_orientation"], inplace = True)
print()

#Remove the most problematic questions, and them make all of the categorical variables dummiew
df_prep = df_clean_v1.drop(columns = ["q1401", "q80928", "q546", "q1040"])
df_dummy = pd.get_dummies(df_prep, dummy_na = False) #When missing giver 0 to all dummies created for the quesion with the missing value
#df_dummy.fillna(value = -1, inplace = True) #If missing insert -1 # This code does not work
X = df_dummy

#INSERT MISSING VALUES BACK INTO THE DUMMIES FOR LATER IMPUTER
for i in list(df_keep["keep_questions"].head()):
    X.loc[df_prep[i].isnull(), X.columns.str.startswith(i+"_")] = np.nan
    
print("Structuring complete")

Making sample data
Finished making sample data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Now creating Dummy of sexual orientation

Structuring complete


### STRUCTURING COMPLETE

# NOW DEFINING FUNCTIONS AND SPLITTING DATASET

### Defining functions to be used for parallel computing

In [98]:
def tree_paralel(x):
    tree = DecisionTreeClassifier(criterion="gini", max_depth= x, random_state=1)  
    accuracy_ = []
    for train_idx, val_idx in kfolds.split(X_dev, y_dev):

        X_train, y_train, = X_dev.iloc[train_idx], y_dev.iloc[train_idx]
        X_val, y_val = X_dev.iloc[val_idx], y_dev.iloc[val_idx] 
        
        X_train = pd.DataFrame(im.fit_transform(X_train),index = X_train.index)
        X_val = pd.DataFrame(im.transform(X_val), index = X_val.index)
        tree.fit(X_train, y_train)
        y_pred = tree.predict(X_val)
        accuracy_.append(accuracy_score(y_val, y_pred))
    print("This was the "+str(x)+" iteration", (dt.now() - start).total_seconds())
    return accuracy_

def forest_paralel(x):
    forest = RandomForestClassifier(criterion="gini", max_depth= x, random_state=1)  
    accuracy_ = []
    for train_idx, val_idx in kfolds.split(X_dev, y_dev):

        X_train, y_train, = X_dev.iloc[train_idx], y_dev.iloc[train_idx]
        X_val, y_val = X_dev.iloc[val_idx], y_dev.iloc[val_idx] 
        
        X_train = pd.DataFrame(im.fit_transform(X_train),index = X_train.index)
        X_val = pd.DataFrame(im.transform(X_val), index = X_val.index)
        forest.fit(X_train, y_train.values.ravel())
        y_pred = forest.predict(X_val)
        accuracy_.append(accuracy_score(y_val, y_pred))
    print("This was the "+str(x)+" iteration", (dt.now() - start).total_seconds())
    return accuracy_

def logit_reg(c_param):
    logit_pipe = make_pipeline(LogisticRegression(random_state= 1, C = c_param)) 
    accuracy_ = []
    for train_idx, val_idx in kfolds.split(X_dev, y_dev):
        X_train, y_train, = X_dev.iloc[train_idx], y_dev.iloc[train_idx]
        X_val, y_val = X_dev.iloc[val_idx], y_dev.iloc[val_idx] 
        
        X_train = pd.DataFrame(im.fit_transform(X_train),index = X_train.index)
        X_val = pd.DataFrame(im.transform(X_val), index = X_val.index)

        logit_pipe.fit(X_train, y_train.values.ravel())
        y_pred = logit_pipe.predict(X_val)
        accuracy_.append(accuracy_score(y_val, y_pred))  
    print("This was the "+str(x)+" iteration", (dt.now() - start).total_seconds())
    return accuracy_

### We make a sample where 80pct. of the data is used for development, and the number of kfolds to be used and type of imputer to replace the missing values

In [79]:
X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

kfolds = KFold(n_splits=10)

# Missing in the dummies will be set to the mean of the answers by default
im = Imputer()

# Logit, Decisiontree and Randomforest

### Logit regression, where we itterate over different values for the regularization parameter and keep the optimal

In [80]:
# RUN THE KFOLDS TO FINT THE OPTIMAL HYPERPARAMETER

p = Pool(4)

input_ = np.linspace(0.1,1,10)
output_ = []
accuracy = []
start = dt.now()
for result in p.imap(logit_reg, input_):
    output_.append(result)
p.close()
temp = pd.DataFrame(output_).mean(axis=1)
temp.index = input_
optimal_l = temp.nlargest(1)
print("Time:", (dt.now() - start).total_seconds())
print("Optimal hyperparameter: "+ str(optimal_l.index[0]) + " with accuracy: " + str(optimal_l.values) )

Time: 91.133668
Optimal hyperparameter: 0.6 with accuracy: [ 0.90936634]


In [117]:
# RUN THE LOGIT WITH OPTIMAL INDEX

logit_pipe = make_pipeline(LogisticRegression(random_state= 1, C = optimal_l.index[0]))


X_dev = pd.DataFrame(im.fit_transform(X_dev), index= X_dev.index)
X_test = pd.DataFrame(im.transform(X_test), index= X_test.index)

logit_pipe.fit(X_dev, y_dev.values.ravel())
predict = pd.DataFrame(logit_pipe.predict(X_test),index = y_test.index)
print(accuracy_score(y_test, predict))
print()
print(print(confusion_matrix(y_test,predict)))

0.901913875598

[[680  29]
 [ 53  74]]
None


# DECISION TREE

In [102]:
p = Pool(4)

input_ = range(1,11)
output_ = []
accuracy = []
start = dt.now()
for result in p.imap(tree_paralel, input_):
    output_.append(result)
p.close()
temp = pd.DataFrame(output_).mean(axis = 1)
temp.index = input_
optimal_t = temp.nlargest(1)
print("Time:", (dt.now() - start).total_seconds())
print("Optimal hyperparameter: "+ str(optimal_t.index[0]) + " with accuracy: " + str(optimal_t.values) )

This was the 1 iteration 2504.129164
This was the 2 iteration 2505.684552
This was the 3 iteration 2507.244761
This was the 4 iteration 2508.687648
This was the 5 iteration 2520.971194
This was the 6 iteration 2523.884034
This was the 7 iteration 2526.930939
This was the 8 iteration 2529.314511
This was the 9 iteration 2539.327477
This was the 10 iteration 2541.560915
Time: 51.497737
Optimal hyperparameter: 6 with accuracy: [ 0.94465725]


In [114]:
tree = DecisionTreeClassifier(criterion='gini', max_depth= optimal_t.index[0], random_state=1)

X_dev = pd.DataFrame(im.fit_transform(X_dev), index= X_dev.index)
X_test = pd.DataFrame(im.transform(X_test), index= X_test.index)


tree.fit(X_dev, y_dev)
predict = pd.DataFrame(tree.predict(X_test),index = y_test.index)
print(accuracy_score(y_test, predict))
print()
print(confusion_matrix(y_test,predict))

0.941387559809

[[687  22]
 [ 27 100]]


# RANDOM FOREST

In [93]:
p = Pool(4)

input_ = range(10,21)
output_ = []
accuracy = []
start = dt.now()
for result in p.imap(forest_paralel, input_):
    output_.append(result)
p.close()
temp = pd.DataFrame(output_).mean(axis = 1)
temp.index = input_
optimal_f = temp.nlargest(1)
print("Time:", (dt.now() - start).total_seconds())
print("Optimal hyperparameter: "+ str(optimal_f.index[0]) + " with accuracy: " + str(optimal_f.values) )

This was the 10 iteration 75.521591
This was the 11 iteration 75.75712
This was the 12 iteration 75.945254
This was the 13 iteration 76.07596
This was the 14 iteration 89.407084
This was the 15 iteration 89.741772
This was the 16 iteration 89.860972
This was the 17 iteration 90.150475
This was the 18 iteration 100.996613
This was the 19 iteration 101.217193
This was the 20 iteration 101.299086
Time: 39.269115
Optimal hyperparameter: 12 with accuracy: [ 0.85043614]


In [119]:
forest = RandomForestClassifier(criterion='gini', max_depth= optimal_f.index[0], random_state=1)

X_dev = pd.DataFrame(im.fit_transform(X_dev), index= X_dev.index)
X_test = pd.DataFrame(im.transform(X_test), index= X_test.index)


forest.fit(X_dev, y_dev.values.ravel())
predict = pd.DataFrame(forest.predict(X_test),index = y_test.index)
print(accuracy_score(y_test, predict))
print()
print(confusion_matrix(y_test,predict))

0.876794258373

[[705   4]
 [ 99  28]]


In [None]:
from sklearn.model_selection import validation_curve

n_estimators_rng = np.unique(np.logspace(0,2,20).astype(np.int64))

clf_rf = RandomForestClassifier(n_estimators=10,random_state=1, max_depth= optimal_f.index[0])
X = pd.DataFrame(im.fit_transform(X), index= X.index)

train_scores, test_scores = \
    validation_curve(estimator=clf_rf, 
                     X=X, 
                     y=y,
                     param_name='n_estimators', 
                     param_range=n_estimators_rng,
                     cv=5)
    
f,ax = plt.subplots()

ax.plot(n_estimators_rng, np.mean(train_scores, 1), label='Test scores')
ax.plot(n_estimators_rng, np.mean(test_scores, 1), label='Train scores') 

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
