In [1]:
#importing all the nexessary libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle
import statistics
from sklearn.svm import SVC
import warnings

# Explore and prepare the data

__Load the data. How many responses and variables do we have?__

In [2]:
#Load the data
wvs = pd.read_csv('wvs.csv.bz2', sep='\t')

In [3]:
#Validating the data
wvs.head()

Unnamed: 0,V2,V4,V5,V6,V7,V8,V9,V10,V11,V12,...,MN_228S8,MN_229A,MN_230A,MN_233A,MN_237B1,MN_249A1,MN_249A3,I_RELIGBEL,I_NORM1,I_VOICE1
0,12,1,1,1,-2,1,1,2,1,1,...,3,-3,-3,-3,-3,1,1,0.0,1.0,0.0
1,12,1,2,3,4,2,2,2,2,2,...,3,-3,-3,-3,-3,2,-1,0.0,1.0,0.66
2,12,1,3,2,4,2,1,2,2,2,...,4,1,1,2,-3,1,1,0.0,1.0,0.33
3,12,1,1,3,4,3,1,2,1,2,...,2,2,1,2,-3,1,2,0.0,1.0,0.0
4,12,1,1,1,2,1,1,1,3,2,...,2,2,1,2,-3,1,2,0.0,1.0,0.66


In [4]:
#Checking the number of rows and columns
wvs.shape

(90350, 328)

There are totally 90350 rows(responses) in the data and 328 columns(variables).

__Q: Create a summary table over all responses for V204:__

In [5]:
#Creating a summary for column 'V204'
wvs['V204'].describe()

count    90350.000000
mean         2.946386
std          2.964040
min         -5.000000
25%          1.000000
50%          2.000000
75%          5.000000
max         10.000000
Name: V204, dtype: float64

__Q: Now remove missings.__

In [6]:
#Checking number of positive answers
wvs[wvs['V204'] > 0].shape
#85742 positive responses

(85742, 328)

There are totally 85742 rows consisting of positive responses to abortion.
Out of 90350, approximately 94% have positive responses towards abortion

In [7]:
#Removing the missings
wvs2 = wvs[(wvs['V204']>0) & (wvs['V2']>0)]
wvs2 = wvs2.dropna()

In [8]:
#Checking the number of rows after removing the missings
wvs2.shape

(79267, 328)

Final number of observations are 79267

__Q: In order to simplify the analysis below, create a new binary variable abortion__

In [9]:
#Making a binary column known as abortion
wvs2['abortion'] = np.where(wvs2['V204']>3, 1, 0)

In [10]:
#Confirms the binary responses
wvs2['abortion'].unique()

array([0, 1], dtype=int64)

__Q: Compute (pearson) correlation table between abortion and all other variables in the data.__

In [11]:
#Computing the correlation table in descending order
wvs2.corr()['abortion'].sort_values(ascending=False).head()

abortion    1.000000
V204        0.881048
V205        0.548653
V203        0.485419
V206        0.446394
Name: abortion, dtype: float64

In [12]:
wvs2.corr()['abortion'].sort_values(ascending=False).tail()

V138   -0.142894
V255   -0.149844
V223   -0.165924
V252   -0.191483
V152   -0.315280
Name: abortion, dtype: float64

__Q: convert country code V2 into dummies.__

In [13]:
#Renaming V2 to country
wvs2 = wvs2.rename(columns = {'V2': 'country'})

In [14]:
#Checking the validity of the change
wvs2.country.unique()

array([ 12,  32,  51,  36,  31,  48, 112,  76, 170, 196, 152, 156, 218,
       233, 268, 276, 288, 344, 356, 368, 392, 400, 398, 417, 422, 434,
       458, 484, 504, 528, 554, 566, 586, 275, 604, 608, 616, 634, 642,
       643, 646, 702, 705, 410, 710, 724, 752, 158, 764, 780, 788, 792,
       804, 840, 858, 860, 887, 716], dtype=int64)

In [15]:
#Converting country to dummy variables
wvs_dum = pd.get_dummies(wvs2, columns = ['country'])
wvs_dum = wvs_dum.dropna()

In [16]:
#Checking column names
wvs_dum.columns

Index(['V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13',
       ...
       'country_752', 'country_764', 'country_780', 'country_788',
       'country_792', 'country_804', 'country_840', 'country_858',
       'country_860', 'country_887'],
      dtype='object', length=386)

In [17]:
#Checking how many extra columns we got
wvs_dum.shape

(79267, 386)

The new dataset after creating the dummy columns now contains 79267 rows and 386 columns.
There are totally __58__ new dummy columns for country

# Implement Cross-Validation

In [75]:
#Creating a function for K-fold CV
def kfold2(k, model, X, y):
    #Creating lists to keep track of the stats of each k
    fscore, accuracy, precision, recall = [], [], [], []
    #Making data
    data = pd.concat([X,y], axis = 1)
    #Shuffling data
    data = shuffle(data)
    #Creating folds
    folds = np.array_split(data, k)
    for i in range(k):
        #Selecting necessary indexes
        train_data = folds.copy()
        test_data = folds[i]
        del train_data[i]
        train_data = pd.concat(train_data, sort=False)
        #Making response and predictor variable
        X_test = test_data[y.columns]
        X_train = train_data[y.columns]
        y_test = test_data[X.columns]
        y_train = train_data[X.columns]
        #Fitting the model
        m = model.fit(y_train, X_train.values.ravel())
        #Predicting the validation set
        pred = m.predict(y_test)
        accuracy.append(accuracy_score(pred, X_test))
        fscore.append(f1_score(pred, X_test))
        precision.append(precision_score(pred, X_test))
        recall.append(recall_score(pred, X_test))
    return([statistics.mean(accuracy), statistics.mean(fscore), statistics.mean(precision), statistics.mean(recall)])

In the above cell I have made the necessary function to take the number of folds, the unfitted model, X and y and generate the mean statistics.

# Find the best model

# KNN

In [76]:
#Taking a sample of the data since my computer can't run the whole set
wvs_dum_sample = wvs_dum.sample(n=10000)

In [77]:
#Calling the KNN function with one neighbor and k=5
X = pd.DataFrame(wvs_dum_sample.loc[:, wvs_dum.columns != 'abortion'])
y = pd.DataFrame(wvs_dum_sample.abortion)
knn = KNeighborsClassifier(n_neighbors=1)
res = kfold2(5, knn, X, y)
#Printing out the results
print('Mean accuracy of the result:', res[0])
print('Mean fscore of the result:', res[1])
print('Mean precision of the result:', res[2])
print('Mean recall of the result:', res[3])


Mean accuracy of the result: 0.7726999999999999
Mean fscore of the result: 0.6831297018567125
Mean precision of the result: 0.6787182847703916
Mean recall of the result: 0.687970227738423


In [78]:
wvs_dum_sample = wvs_dum.sample(n=10000)

In [79]:
#Calling the KNN function with one neighbor and k=10
X = pd.DataFrame(wvs_dum_sample.loc[:, wvs_dum.columns != 'abortion'])
y = pd.DataFrame(wvs_dum_sample.abortion)
knn = KNeighborsClassifier(n_neighbors=1)
res = kfold2(10, knn, X, y)
res
print('Mean accuracy of the result:', res[0])
print('Mean fscore of the result:', res[1])
print('Mean precision of the result:', res[2])
print('Mean recall of the result:', res[3])

Mean accuracy of the result: 0.7656666666666666
Mean fscore of the result: 0.6792828917568031
Mean precision of the result: 0.6765256136319355
Mean recall of the result: 0.6835062960510286


For the above question, I have ran the KNN twice once with 5 folds and once with 10 folds. The accuracy hasn't changed much with changing the number of folds.

The accuracy for k=5 is 77% and k=10 is 76.5%

There is no significant difference in the accuracy after changing the k value for the same data set.


# Logistic regression

In [80]:
#Hacky fix for removing warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn", lineno=433)

In [81]:
#Making a logistic regression model and sending it to the function
X = pd.DataFrame(wvs_dum.loc[:, wvs_dum.columns != 'abortion'])
y = pd.DataFrame(wvs_dum.abortion)
lmodel = LogisticRegression()
res = kfold2(10, lmodel, X, y)
print('Mean accuracy of the result:', res[0])
print('Mean fscore of the result:', res[1])
print('Mean precision of the result:', res[2])
print('Mean recall of the result:', res[3])

Mean accuracy of the result: 1.0
Mean fscore of the result: 1.0
Mean precision of the result: 1.0
Mean recall of the result: 1.0


Surprisingly, the accuracy, f-score, presicion and recall for the Logistic regression model is 1.0. The Logistic Regression model is run on the whole data set not a sample.

# SVM

In [82]:
#Hacky fix to remove warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn", lineno=196)

In [83]:
#Running the SVM classifier with linear kernel.
wvs_dum_sample = wvs_dum.sample(n=6000)
X = pd.DataFrame(wvs_dum_sample.loc[:, wvs_dum_sample.columns != 'abortion'])
y = pd.DataFrame(wvs_dum_sample.abortion)
svclassifier = SVC(kernel='linear')
res = kfold2(10, svclassifier, X, y)
print('Mean accuracy of the result:', res[0])
print('Mean fscore of the result:', res[1])
print('Mean precision of the result:', res[2])
print('Mean recall of the result:', res[3])

Mean accuracy of the result: 1.0
Mean fscore of the result: 1.0
Mean precision of the result: 1.0
Mean recall of the result: 1.0


The accuracy for this model is 100%.

In [84]:
#Running the SVM classifier with polynomial kernel with 1 degree
svclassifier = SVC(kernel='poly', degree=1)
res = kfold2(10, svclassifier, X, y)
print('Mean accuracy of the result:', res[0])
print('Mean fscore of the result:', res[1])
print('Mean precision of the result:', res[2])
print('Mean recall of the result:', res[3])

Mean accuracy of the result: 0.9881666666666666
Mean fscore of the result: 0.9834218162146652
Mean precision of the result: 0.9785953289522696
Mean recall of the result: 0.9883513068699523


In [85]:
#Running the SVM classifier with polynomial kernel with 8 degrees
svclassifier = SVC(kernel='poly', degree=8)
res = kfold2(10, svclassifier, X, y)
print('Mean accuracy of the result:', res[0])
print('Mean fscore of the result:', res[1])
print('Mean precision of the result:', res[2])
print('Mean recall of the result:', res[3])

Mean accuracy of the result: 0.9585
Mean fscore of the result: 0.9416368806586395
Mean precision of the result: 0.9293182984692685
Mean recall of the result: 0.9544337904221653


As we can see above, the SVC classifier with degree 8 reduced the precision and recall from the linear model.

While the linear model gave the mean statistics as 1, the polynomial model gave 98% and 95% with degree=1 and degree=8 respectively.


__3.4 Compare the models__

__1. Finally, compare the models. Which ones performed the best in terms of accuracy? Which ones in terms of F-score? Did you encounter other kind of issues with certain models? Which models were fast and which ones slow?__

The logistic Regression model gave the best accuracy and F-score. It was 1.0 for all the statistics mean. The model performed the best for LR. 
No particular issues were encountered while running the models. 
KNN and SVM were relatively slower as compared to LR. I could only run a sample of the data through KNN. My system froze when I tried running the entire data set.

__2. If you have to repeat the exercise with a single model (and you have, see below), which one will you pick?__

For repeating the execise, I would pick two models which are the LR model and the SVM model.

LR model because it gave the best accuracy and SVM model because it gave comparable results on different kernels. Realistically, to compare the effect of a variable, I would test it using both the models.

# How large a role does country play?

Running on LR model first. It gave the best accuracy and f-score(1.0)

In [86]:
#Making a logistic regression model and sending it to the function
X = pd.DataFrame(wvs_dum.loc[:, wvs_dum.columns != 'abortion'])
y = pd.DataFrame(wvs_dum.abortion)
lmodel = LogisticRegression()
res = kfold2(10, lmodel, X, y)
print('Mean accuracy of the result:', res[0])
print('Mean fscore of the result:', res[1])
print('Mean precision of the result:', res[2])
print('Mean recall of the result:', res[3])

Mean accuracy of the result: 1.0
Mean fscore of the result: 1.0
Mean precision of the result: 1.0
Mean recall of the result: 1.0


In [87]:
#Making a logistic regression model and sending it to the function
X = pd.DataFrame(wvs2.loc[:, wvs2.columns != 'abortion'])
y = pd.DataFrame(wvs2.abortion)
lmodel = LogisticRegression()
res = kfold2(10, lmodel, X, y)
print('Mean accuracy of the result:', res[0])
print('Mean fscore of the result:', res[1])
print('Mean precision of the result:', res[2])
print('Mean recall of the result:', res[3])

Mean accuracy of the result: 1.0
Mean fscore of the result: 1.0
Mean precision of the result: 1.0
Mean recall of the result: 1.0


I ran the tests on the best model, i.e., Logistic Regression model which previously gave an accuracy of 1.0. The tests were run with and without countries.
There was no difference in the accuracy with or without the countries. This was expected since accuracy was 1.0.
To see a realistic difference in accuracy and understanding whether countries make a difference, I am running the same tests on an SVC classifier with poly kernel and degree=8

Now I am running the SVC model with poly kernel and degree 8 since it gave a comparable accuracy

In [88]:
#With countries running on SVC with poly kernel and degree 8
wvs_dum_sample = wvs_dum.sample(n=10000)
X = pd.DataFrame(wvs_dum_sample.loc[:, wvs_dum_sample.columns != 'abortion'])
y = pd.DataFrame(wvs_dum_sample.abortion)
svclassifier = SVC(kernel='poly', degree=8)
res = kfold2(10, svclassifier, X, y)
print('Mean accuracy of the result:', res[0])
print('Mean fscore of the result:', res[1])
print('Mean precision of the result:', res[2])
print('Mean recall of the result:', res[3])

Mean accuracy of the result: 0.9728
Mean fscore of the result: 0.961885699625286
Mean precision of the result: 0.9544931157831611
Mean recall of the result: 0.9695475365318793


In [89]:
#Without countries running on SVC with poly kernel and degree 8
wvs2_sample = wvs2.sample(n=10000)
X = pd.DataFrame(wvs2_sample.loc[:, wvs2_sample.columns != 'abortion'])
y = pd.DataFrame(wvs2_sample.abortion)
svclassifier = SVC(kernel='poly', degree=8)
res = kfold2(10, svclassifier, X, y)
print('Mean accuracy of the result:', res[0])
print('Mean fscore of the result:', res[1])
print('Mean precision of the result:', res[2])
print('Mean recall of the result:', res[3])

Mean accuracy of the result: 0.9639
Mean fscore of the result: 0.9498898512718916
Mean precision of the result: 0.95006228069222
Mean recall of the result: 0.9497857203141739


Even after the data was tested with the SVC classifier, there was no significant difference with or without countries. 

The accuracy was 97% with country dummies and 96.3% without country dummies, therefore confirming that countries make very little impact.

__Therefore, the country information does not help to noticeably improve the prediction.__

__DONE__