In [None]:
import warnings

# Suppress every warning raised through the `warnings` module from this point on.
# NOTE(review): this is a blanket filter, so library deprecation/convergence
# warnings are hidden as well — consider narrowing it.
warnings.filterwarnings('ignore')

import pandas as pd 
import numpy as np

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

from LinearModel_exercise_pipes import *

# Linear Regression

In [None]:
# Location of the Facebook comments CSV used in the regression section.
# NOTE(review): absolute path under one user's home directory — adjust before running elsewhere.
file=r'/Users/lalitsachan/Dropbox/PDSV4/4. Linear Models/facebook_comments.csv'

In [None]:
# Read the CSV at `file` into the DataFrame `fb`.
fb=pd.read_csv(file)

In [None]:
# Preview the first rows of the raw data.
fb.head()

In [None]:
# Column dtypes and non-null counts, used to decide how each column is treated below.
fb.info()

In [None]:
# Column groups for the regression data; each group gets its own sub-pipeline.
cyclic_feat = ['Post Published Weekday', 'Base Date Time Weekday']
cat_feat = ['page_category']
target = ['Comments_in_next_H_hrs']

# Every remaining column (not cyclic, categorical or the target) is handled as numeric.
num_feat = [
    col for col in fb.columns
    if col not in cyclic_feat + cat_feat + target
]

In [None]:
# Three column-wise sub-pipelines, one per feature group, combined side by side
# with a FeatureUnion. pdPipeline, VarSelector, custom_cyclic, DataFrameImputer and
# get_dummies_Pipe come from the project module LinearModel_exercise_pipes.

# p1: select the weekday columns and pass them through custom_cyclic()
# (presumably a cyclic encoding of the weekday — confirm in the helper module).
p1=pdPipeline([
    ('select_cyclic',VarSelector(cyclic_feat)),
    ('cyclic_feat',custom_cyclic())
])
# p2: select the categorical column, impute missing values, then create dummies.
# NOTE(review): 200 is presumably a frequency cutoff for dummy creation — TODO confirm.
p2=pdPipeline([
    ('select_cat',VarSelector(cat_feat)),
    ('missing_treat',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(200))
])
# p3: select the numeric columns and impute missing values.
p3=pdPipeline([
    ('select_num',VarSelector(num_feat)),
    ('missing_treat',DataFrameImputer())
])

# Concatenate the outputs of the three sub-pipelines column-wise.
data_pipe=FeatureUnion([
    ('p1',p1),
    ('p2',p2),
    ('p3',p3)
])

In [None]:
# Fit every step of the combined pipeline on the full Facebook dataset.
data_pipe.fit(fb)

In [None]:
# Materialise the transformed features as a DataFrame whose columns carry the
# names reported by the pipeline.
x_train = pd.DataFrame(
    data_pipe.transform(fb),
    columns=data_pipe.get_feature_names(),
)

# `target` is a list, so this selection is a one-column DataFrame rather than a Series.
y_train = fb[target]

In [None]:
# (rows, columns) of the transformed feature matrix.
x_train.shape

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Baseline model: LinearRegression with all default settings.
lr=LinearRegression()

In [None]:
# 10-fold cross-validated error of the baseline model. The scorer returns the
# negated mean absolute error, so the sign is flipped to get positive MAE values.
cvmae = -np.asarray(
    cross_val_score(
        estimator=lr,
        X=x_train,
        y=y_train,
        scoring='neg_mean_absolute_error',
        cv=10,
        n_jobs=-1,
    )
)

In [None]:
# Average MAE across the 10 folds.
cvmae.mean()

In [None]:
# Fold-to-fold spread of the MAE.
cvmae.std()

In [None]:
from sklearn.linear_model import Lasso,Ridge
from sklearn.model_selection import GridSearchCV

In [None]:
def report(results, n_top=3):
    """Print a short summary of the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping
        Search results indexed by ``'rank_test_score'``, ``'mean_test_score'``,
        ``'std_test_score'`` and ``'params'`` (this notebook passes the
        ``cv_results_`` attribute of a fitted ``GridSearchCV``).
    n_top : int, default 3
        Highest rank to report; ranks ``1`` through ``n_top`` are printed.

    Returns
    -------
    None
        The summary is written to standard output. Candidates tied on a rank
        are all printed, in index order.
    """
    for rank in range(1, n_top + 1):
        # Indices of every candidate that holds this rank (ties give several).
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.6f} (std: {1:.6f})".format(
                mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

In [None]:
### lasso
# Lasso has no closed-form solution, so this search takes longer to finish
# than the ridge search.

# Candidate penalties: 100 evenly spaced alphas from 1 to 100.
params = {'alpha': np.linspace(1, 100, 100)}
model = Lasso(fit_intercept=True)

# 10-fold grid search scored by negated MAE (higher is better), using all cores.
gs = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=10,
    n_jobs=-1,
    verbose=20,
)
gs.fit(x_train, y_train)

In [None]:
# Show the five best-ranked alpha values from the lasso search.
report(gs.cv_results_,5)

In [None]:
# Keep a handle on the best lasso estimator before `gs` is rebound by the ridge search.
lasso_model=gs.best_estimator_

In [None]:
# Fit the selected lasso model on the full training set.
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ should already
# be fitted on this same data, so this call looks redundant — confirm.
lasso_model.fit(x_train,y_train)

In [None]:
# Pair each feature name reported by the pipeline with its lasso coefficient.
list(zip(data_pipe.get_feature_names(),lasso_model.coef_))



In [None]:
# Number of coefficients that the lasso model set to exactly zero.
(lasso_model.coef_==0).sum()



In [None]:
### ridge

# Candidate penalties: 100 evenly spaced alphas from 1e7 to 1e9.
params = {'alpha': np.linspace(1e7, 1e9, 100)}
model = Ridge(fit_intercept=True)

# Same search setup as for lasso: 10 folds, negated MAE, all cores.
# Note that `model`, `params` and `gs` are rebound here.
gs = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=10,
    n_jobs=-1,
    verbose=20,
)
gs.fit(x_train, y_train)

In [None]:
# Show the five best-ranked alpha values from the ridge search.
report(gs.cv_results_,5)

# Logistic Regression

In [None]:
# Location of the credit-card default workbook used in the classification section.
# NOTE(review): absolute, machine-specific path; this also rebinds `file` from the
# regression section.
file=r'/Users/lalitsachan/Dropbox/PDSV4/4. Linear Models/default of credit card clients.xls'
# skiprows=1 drops the first row of the sheet before parsing
# (presumably an extra header line above the real column names — confirm).
dd=pd.read_excel(file,skiprows=1)

In [None]:
# Preview the first rows of the credit-card data.
dd.head()

In [None]:
# Column names, used to build the feature lists below.
dd.columns

In [None]:
# (rows, columns) of the raw data.
dd.shape

In [None]:
# Column dtypes and non-null counts.
dd.info()

In [None]:
# Column groups for the classification data. These rebind the names used in the
# regression section.
cat_feat = ['SEX', 'EDUCATION', 'MARRIAGE']
target = ['default payment next month']

# Everything that is not categorical, the target or the ID column is handled as numeric.
num_feat = [
    col for col in dd.columns
    if col not in cat_feat + target + ['ID']
]

In [None]:
# Two column-wise sub-pipelines for the credit-card data, combined with a
# FeatureUnion. This rebinds p1, p2 and data_pipe from the regression section.

# p1: select the categorical columns, impute missing values, then create dummies.
# NOTE(review): 300 is presumably a frequency cutoff for dummy creation — TODO confirm.
p1=pdPipeline([
    ('select_cat',VarSelector(cat_feat)),
    ('missing_treat',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(300))
])
# p2: select the numeric columns and impute missing values.
p2=pdPipeline([
    ('select_num',VarSelector(num_feat)),
    ('missing_treat',DataFrameImputer())
])

# Concatenate the outputs of both sub-pipelines column-wise.
data_pipe=FeatureUnion([
    ('p1',p1),
    ('p2',p2)
])

In [None]:
# Fit every step of the combined pipeline on the full credit-card dataset.
data_pipe.fit(dd)

In [None]:
# Materialise the transformed features as a DataFrame whose columns carry the
# names reported by the pipeline.
x_train = pd.DataFrame(
    data_pipe.transform(dd),
    columns=data_pipe.get_feature_names(),
)

# `target` is a list, so this selection is a one-column DataFrame rather than a Series.
# NOTE(review): scikit-learn classifiers expect a 1-D y and convert a column
# vector with a warning (which is silenced in this notebook).
y_train = dd[target]

In [None]:
# (rows, columns) of the transformed feature matrix.
x_train.shape

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Hyper-parameter grid for logistic regression: 2 class weightings x 2 penalties
# x 10 values of C = 40 candidates.
# NOTE(review): the 'l1' penalty is only accepted by solvers that support it
# (e.g. liblinear or saga) — make sure the estimator uses one of them.
params = {
    'class_weight': ['balanced', None],
    'penalty': ['l1', 'l2'],
    'C': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 1, 2, 5],
}

In [None]:
# Base estimator for the grid search below.
# The solver is pinned to liblinear because the grid covers both the 'l1' and 'l2'
# penalties: recent scikit-learn releases default to lbfgs, which rejects 'l1',
# so every 'l1' candidate would fail and be scored as NaN (with warnings silenced,
# that failure goes unnoticed). liblinear supports both penalties and was the
# default solver in older releases, so results there are unchanged.
model=LogisticRegression(fit_intercept=True,solver='liblinear')

In [None]:
# 10-fold grid search over `params`, scored by ROC AUC and run on all cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring="roc_auc",
    cv=10,
    verbose=20,
    n_jobs=-1,
)

In [None]:
# Run the search: every parameter combination is cross-validated on the training data.
grid_search.fit(x_train,y_train)

In [None]:
# Show the five best-ranked parameter combinations.
report(grid_search.cv_results_,5)

In [None]:
# Estimator built with the best parameter combination found by the search.
logr=grid_search.best_estimator_

In [None]:
# Fit the selected logistic model on the full training set.
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ should already
# be fitted on this same data, so this call looks redundant — confirm.
logr.fit(x_train,y_train)

In [None]:
# Second column of predict_proba is used as the score for each training row
# (presumably the probability of the positive class, label 1 — confirm via logr.classes_).
train_score=logr.predict_proba(x_train)[:,1]
# Ground-truth labels that the thresholded scores are compared against.
real=y_train

In [None]:
from sklearn.metrics import fbeta_score

In [None]:
# Sweep 99 probability cutoffs from 0.01 to 0.99 and record the F-beta score
# (beta=2) obtained on the training data at each one.
cutoffs = np.linspace(0.01, 0.99, 99)
fbeta_all = []
for cutoff in cutoffs:
    # Scores strictly above the cutoff are labelled 1, everything else 0.
    predicted = (train_score > cutoff).astype(int)
    fbeta_all.append(fbeta_score(real, predicted, beta=2))

In [None]:
# Cutoff(s) at which the F-beta score is highest.
# `fbeta_all` is a plain Python list, so comparing it directly with a scalar gives a
# single bool (False) instead of an element-wise mask, and indexing `cutoffs` with
# that bool returns an empty array. Converting to an ndarray first yields a proper
# boolean mask; ties return every cutoff that reaches the maximum.
mycutoff=cutoffs[np.asarray(fbeta_all)==max(fbeta_all)]
mycutoff