In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, validation_curve, GridSearchCV, KFold, StratifiedKFold, RepeatedStratifiedKFold
from statistics import mean
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from scipy.stats import loguniform
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

In [None]:
preprocessed_df = pd.read_csv("/work/preprocessed.csv", index_col=0)
preprocessed_df.head()

Unnamed: 0,char_id,guild,total_timestamps,unique_days,max_level,min_month,max_month,Average_Hour,Average_Playing_density,Playing_after_6_months
0,2,1,1,1,18,12,12,0.166667,0.032787,0
1,7,1,655,50,71,1,12,2.183333,0.136612,1
2,9,1,2739,133,70,1,6,3.432331,0.726776,1
3,10,0,674,49,61,7,11,2.292517,0.321311,0
4,19,1,3001,149,70,1,12,3.356823,0.407104,1


In [None]:
y = preprocessed_df['Playing_after_6_months']
preprocessed_df.drop(columns=['Playing_after_6_months', 'min_month', 'max_month', 'char_id'], inplace=True)
X = preprocessed_df
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y)

In [None]:
# Implement Logistic Regression with regularization, KNN classifier and SVM


In [None]:
preprocessed_df.isnull().sum().sum()


0

In [None]:
preprocessed_df.columns

Index(['guild', 'total_timestamps', 'unique_days', 'max_level', 'Average_Hour',
       'Average_Playing_density'],
      dtype='object')

In [None]:
# pipeline for numerical transformer
numeric_features = ['guild','total_timestamps', 'unique_days', 'max_level', 'Average_Hour', 'Average_Playing_density']
numeric_transformer = Pipeline(steps = [('scaler', StandardScaler())])
# construct the column transformer
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features)])

## Knn-Neighbors

In [None]:
# Standardize the training set using StandardScaler()
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Test the accuracy of the KNN model on the standardized training set
neighbors = np.arange(1,15,2)
cross_val_scores =[]
skfold=StratifiedKFold(n_splits = 10, shuffle=False)

for i in neighbors:
    knn = KNeighborsClassifier(n_neighbors = i)
    scores = cross_val_score(knn,X_train_scaled,y_train,cv=skfold)
    cross_val_scores.append(np.mean(scores))
print(cross_val_scores)  

print("best cross-validation score:{:.3f}".format(np.max(cross_val_scores)))
best_n_neighbors = neighbors[np.argmax(cross_val_scores)]
print("best n_neighbors:{}".format(best_n_neighbors))

knn_best = KNeighborsClassifier(n_neighbors = best_n_neighbors)
knn_best.fit(X_train,y_train)

print("test-set score:{:.3f}".format(knn_best.score(X_test,y_test)))

[0.9769765488566973, 0.9783331689686641, 0.9794396489884184, 0.9786186028697307, 0.9784044828488654, 0.978368819538041, 0.9779760516854259]
best cross-validation score:0.979
best n_neighbors:5
test-set score:0.899


## SVM

In [None]:
from sklearn.svm import LinearSVC
clf_svm = Pipeline(steps=[
    ('preprocessor', preprocessor), 
    ('classifier', LinearSVC(max_iter=40000))
])

In [None]:
clf_svm

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['guild', 'total_timestamps',
                                                   'unique_days', 'max_level',
                                                   'Average_Hour',
                                                   'Average_Playing_density'])])),
                ('classifier', LinearSVC(max_iter=40000))])

In [None]:
param_grid_svm=[{'classifier': [LinearSVC()],
               'classifier__penalty': ['hinge'],
               'svc__C':[0.001,0.01,0.1,1,10,100]},
               ]



In [None]:
param_grid_svm

[{'classifier': [LinearSVC()],
  'classifier__penalty': ['hinge'],
  'svc__C': [0.001, 0.01, 0.1, 1, 10, 100]}]

In [None]:
grid_svm = GridSearchCV(clf_svm,param_grid_svm,cv=10)
grid_svm.fit(X_train,y_train)
grid_svm

ValueError: Invalid parameter svc for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['guild', 'total_timestamps',
                                                   'unique_days', 'max_level',
                                                   'Average_Hour',
                                                   'Average_Playing_density'])])),
                ('classifier', LinearSVC())]). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
estimator.get_param().keys()

NameError: name 'estimator' is not defined

In [None]:
linearSVC = LinearSVC(max_iter=4000)
linearSVC.fit(X_train,y_train)



LinearSVC(max_iter=4000)

## Logistic Regression

In [None]:
# assemble the end-to-end pipeline
clf = Pipeline(steps=[
    ('preprocessor', preprocessor), 
    #('classifier', KNeighborsClassifier(n_neighbors = 3))
    ('classifier', LogisticRegression())
])

In [None]:
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9920762394260627

In [None]:
param_grid = [{'classifier': [LogisticRegression()],
               'classifier__solver': ['lbfgs'],
               'classifier__penalty': ['l2'],
               'classifier__C': [0.001,0.01,0.1,1,10,100]},
               #'classifier__C': loguniform.rvs(1e-3, 1e0, size=100)},
              ]

In [None]:
#GridSearchCV(estimator, parameters to tune, cross validation fold)
grid_search = GridSearchCV(clf,param_grid,cv=10)
grid_search.fit(X_train,y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         ['guild',
                                                                          'total_timestamps',
                                                                          'unique_days',
                                                                          'max_level',
                                                                          'Average_Hour',
                                                                          'Average_Playing_density'])])),
                                       ('classifier', LogisticRegression())]),
     

In [None]:
grid_search.best_score_

1.0

In [None]:
grid_search.best_params_

{'classifier': LogisticRegression(C=100),
 'classifier__C': 100,
 'classifier__penalty': 'l2',
 'classifier__solver': 'lbfgs'}

In [None]:
grid_search.score(X_test,y_test)

1.0

# example from class

In [None]:
# pipeline for numerical transformer
numeric_features = ['age','fare']
numeric_transformer = Pipeline(steps = 
                              [('imputer', SimpleImputer(strategy='median')), 
                               ('scaler', StandardScaler())]
                              )

# pipeline for categorical transformer
categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps = 
                              [('imputer', SimpleImputer(strategy='most_frequent', missing_values='nan')), 
                               ('onehot', OneHotEncoder(handle_unknown='ignore'))]
                              )

NameError: name 'Pipeline' is not defined

In [None]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features), 
    ('cat', categorical_transformer, categorical_features)
])

In [None]:
clf = Pipeline(steps=[
    ('preprocessor', preprocessor), 
    ('classifier', KNeighborsClassifier(n_neighbors = 3))
])

In [None]:
# because its a classification problem, use stratify
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y, test_size=0.2)

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
# hyperparameter can ba the imputer strategy (median, mean, ...) or the nb of neighbors
# __ are accessor
param_grid = {'classifier__n_neighbors':np.arange(1,30,2), 
              'preprocessor__num__imputer__strategy':['mean', 'median']}

In [None]:
#GridSearchCV(estimator, parameters to tune, cross validation fold)
grid_search = GridSearchCV(clf,param_grid,cv=10)

In [None]:
grid_search.fit(X_train,y_train)

In [None]:
https://scikit-learn.org/stable/auto_examples/linear_model/plot_logistic_l1_l2_sparsity.html

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=3936dca8-42ca-41a4-aad6-22f3c31c57fb' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>