In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('talk')
import datetime
from calendar import isleap
import missingno as msno
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from matplotlib.ticker import FormatStrFormatter
import random
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV
from sklearn import metrics
from sklearn import svm
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MaxAbsScaler, MinMaxScaler
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, FactorAnalysis, TruncatedSVD, NMF, FastICA
from sklearn.feature_selection import SelectKBest, SelectPercentile, SelectFromModel, VarianceThreshold
from sklearn.pipeline import FeatureUnion

In [2]:
# Read transformed dataframe
df = joblib.load('J:/Source/Exercises/Exercise2/ModelingData.pkl')

# Under normal circumstances the first step I would undertake on the transformed data would be to encode
# Encoding can be done either through sklearn's one hot encoding, or through pandas get_dummies, as below:
'''pd.get_dummies(df, prefix=['channelName', 'title', 'sessionType', 'sessionSubType', 'genre', 'subGenre',
                                'episodeTitle', 'seriesTitle', 'gender'],
               columns=['channelName', 'title', 'sessionType', 'sessionSubType', 'genre', 'subGenre',
                                'episodeTitle', 'seriesTitle', 'gender'], sparse=True)'''
# Unfortunately, even while using sparse matrices, the memory requirements exceed my current machine's capabilities


#Due to hardware limitations, we need to come up with alternative solutions
#We still need to aggregate categorical labels per household, but first let's reduce the dataset

"pd.get_dummies(df, prefix=['channelName', 'title', 'sessionType', 'sessionSubType', 'genre', 'subGenre',\n                                'episodeTitle', 'seriesTitle', 'gender'],\n               columns=['channelName', 'title', 'sessionType', 'sessionSubType', 'genre', 'subGenre',\n                                'episodeTitle', 'seriesTitle', 'gender'], sparse=True)"

In [3]:
#Remove sessions with a 0 or negative length
df = df.loc[df['sessionLength'] > 0]
#Remove surf, due to our previous assumption that it is not actually a person watching a title
df = df.loc[df['title'] != 'Surf']
#Only look at normal playback speed
df = df.loc[df['playbackSpeed'] == 1000]
#Remove sessions with broadcast length < 0
df = df.loc[df['broadcastLength'] > 0]
#Do not consider sessions shorter than 15 seconds
df = df.loc[df['sessionLength'] >= 15]

#While there are concerns with removing the above pieces of information, we approach encoding again.
#Unfortunately, our machine can still not handle the amount of data
#Thus, we have 2 options:
#1 - we resort to sampling procedures
#2 - we create metrics based off of medians, quartiles, averages, etc. for each household

In [4]:
#We decide to move forward with the sampling procedure
#First we only select the data that has a target
dfTrain = df.loc[df['ageBinTarget'] != 'nan']

#Delete households with less than 10 views
counts = dfTrain['ID'].value_counts()
dfTrain = dfTrain[df['ID'].isin(counts[counts >= 10].index)]

  import sys


In [5]:
#Retrieve top 10 shows for each household
store = pd.DataFrame()
grouped = dfTrain.groupby('ID')
for name, group in grouped:
    temp = group.title.value_counts().iloc[:5].reset_index()
    topTitles = temp.T.iloc[0, :]
    topTitlesCount = temp.T.iloc[1, :]
    combined = topTitlesCount.append(topTitles).reset_index(drop=True)
    combined.rename(name, inplace=True)
    store = store.append(combined)

In [6]:
#Testing encoding on a 15% sample at this point still does not manage to provide results due to spec limits
#Delete some more columns:
del dfTrain['sessionSubType']
del dfTrain['subGenre']
del dfTrain['episodeTitle']
del dfTrain['seriesTitle']
del dfTrain['title']
del dfTrain['playbackSpeed']
del dfTrain['broadcastLength']

In [7]:
#Removal of all the columns make the sampling procedure irrelevant, hence we 'sample' all the data
sample = dfTrain.sample(frac=1, random_state=333)
dummySample = pd.get_dummies(sample, prefix=['sessionStartHour', 'sessionStartDayOfWeek', 'sessionEndHour',
                                             'sessionEndDayOfWeek','broadcastStartHour', 'broadcastStartDay',
                                             'broadcastEndHour', 'broadcastEndDay', 'channelName',
                                             'sessionType', 'genre', 'gender'],
               columns=['sessionStartHour', 'sessionStartDayOfWeek', 'sessionEndHour',
                                             'sessionEndDayOfWeek','broadcastStartHour', 'broadcastStartDay',
                                             'broadcastEndHour', 'broadcastEndDay', 'channelName',
                                             'sessionType', 'genre', 'gender'], sparse=True)

In [8]:
#Groupby household and sum or average columns
df = dummySample.groupby(['ID', 'ageBinTarget']).sum()
temp = dummySample[['ID', 'sessionLength', 'viewingDifference']].groupby('ID').mean()
del df['sessionLength']
del df['viewingDifference']
df['sessionLength'] = temp['sessionLength']
df['viewingDifference'] = temp['viewingDifference']
df = df.reset_index()

In [9]:
#Add top 5 titles and encode
df = df.merge(store, left_on='ID', right_index=True)
df = pd.get_dummies(df, prefix=[5, 6, 7, 8, 9], columns=[5, 6, 7, 8, 9])

In [10]:
# Split dataframe into features and target
y = df.iloc[:, 1]  # .as_matrix()
X = df.iloc[:, 2:]  # .as_matrix()

# Scalings
sc = StandardScaler()
ma = MaxAbsScaler()
mm = MinMaxScaler()

# Apply scaler
colNames = X.columns
X.fillna(0, inplace=True)
X = sc.fit_transform(X)
X = pd.DataFrame(X, columns=colNames)

In [11]:
# Remove features with less than 5% variance
colNames = X.columns
sel = VarianceThreshold(threshold=0.16)
X = sel.fit_transform(X)
# Get column names back
newCols = []
for remain, col in zip(sel.get_support(), colNames):
    if remain == True:
        newCols.append(col)
X = pd.DataFrame(X, columns=newCols)
#X = transformed.merge(X.iloc[:, -5:], left_index=True, right_index=True)

In [12]:
# Perform univariate feature selection (ANOVA F-values)
colNames = X.columns
selection_Percent = SelectPercentile(percentile=5)
X = selection_Percent.fit_transform(X, y)
# Get column names back
newCols = []
for remain, col in zip(selection_Percent.get_support(), colNames):
    if remain == True:
        newCols.append(col)
X = pd.DataFrame(X, columns=newCols)
#X = transformed.merge(X.iloc[:, -5:], left_index=True, right_index=True)

In [13]:
# Perform tree-based feature selection
clf = ExtraTreesClassifier()
clf = clf.fit(X, y)
colNames = X.columns
sel = SelectFromModel(clf, prefit=True)
X = sel.transform(X)
newCols = []
for remain, col in zip(sel.get_support(), colNames):
    if remain == True:
        newCols.append(col)
X = pd.DataFrame(X, columns=newCols)

In [14]:
#Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1234)


In [15]:
def testClassifier(clf):
    param_grid = [{'n_estimators': range(50, 100, 10),
                   'min_samples_split': range(10, 16, 1),
                   'min_samples_leaf': range(5, 30, 5),
                   'max_leaf_nodes': (5, 30, 5)
                   }]

    grid = GridSearchCV(clf, param_grid, cv=3, verbose=1, n_jobs=-1)
    fitted_classifier = grid.fit(X_train, y_train)
    print(grid.best_score_, grid.best_params_)
    predictions = fitted_classifier.predict(X_train)

    fitted = clf.fit(X_train, y_train)
    scoresCV = cross_val_score(clf, X_train, y_train, cv=3, verbose=0, n_jobs=-1)
    trainPredictionsCV = cross_val_predict(clf, X_train, y_train, cv=3, verbose=0, n_jobs=-1)

    trainPredictions = clf.predict(X_train)
    testPredictions = clf.predict(X_test)

    score1 = metrics.accuracy_score(y_test, testPredictions)
    score2 = metrics.cohen_kappa_score(y_test, testPredictions)
    #score3 = metrics.roc_auc_score(y_test, testPredictions)
    score4 = metrics.confusion_matrix(y_test, testPredictions)
    score5 = metrics.classification_report(y_test, testPredictions)
    print('Train score: ', metrics.accuracy_score(y_train, trainPredictions))
    print('CV score: ', scoresCV)
    print('Accuracy, Cohen Kappa')#, ROC AUC Score')
    print(score1, score2)#, score3)
    print('Confusion Matrix')
    print(score4)
    print('Classification Report')
    print(score5)

In [17]:
lr = LogisticRegression(C = 0.005)
sgd = SGDClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(max_features='sqrt', max_depth=2)

#print('LR')
#testClassifier(lr)
#print('DT')
#testClassifier(dt)
#print('RF')
testClassifier(rf)

Fitting 3 folds for each of 450 candidates, totalling 1350 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:   34.8s
[Parallel(n_jobs=-1)]: Done 1350 out of 1350 | elapsed:   37.4s finished


0.440124416796 {'max_leaf_nodes': 5, 'min_samples_leaf': 5, 'min_samples_split': 15, 'n_estimators': 80}
Train score:  0.46500777605
CV score:  [ 0.40930233  0.44392523  0.36448598]
Accuracy, Cohen Kappa
0.410094637224 0.159639920612
Confusion Matrix
[[70 30  9  0  0]
 [52 39 10  0  0]
 [17 13 21  0  0]
 [10 11 17  0  0]
 [ 5  4  9  0  0]]
Classification Report
             precision    recall  f1-score   support

   (25, 35]       0.45      0.64      0.53       109
   (35, 45]       0.40      0.39      0.39       101
   (45, 55]       0.32      0.41      0.36        51
   (55, 65]       0.00      0.00      0.00        38
  (65, 115]       0.00      0.00      0.00        18

avg / total       0.34      0.41      0.37       317



  'precision', 'predicted', average, warn_for)
