# Use of Logistic Regression

In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, balanced_accuracy_score
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

In [2]:
data = pd.read_csv('./datasets/final.csv')

In [3]:
data.head()

Unnamed: 0,comments,label
0,[Baldinger] .@49ers here is my opening script ...,1
1,[49ers on NBCS] .@frankgore is loving what he’...,1
2,Chiefs fans be like,1
3,deal of the day,1
4,This guy made a really complex Python simulati...,1


In [4]:
X = data['comments']
y = data['label']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)

In [6]:
pipe = Pipeline([
    
    #('tfidf',TfidfVectorizer()),
   ('cvec', CountVectorizer()),
    ('logreg', LogisticRegression()),
   
])

In [19]:
grid_param = {
    'cvec__ngram_range' : [(1,1),(1,2),(1,3)],
    'cvec__max_features': [300,None],
    'cvec__min_df'      : [2, 3],
    'cvec__max_df'      : [.95,.05],
    'cvec__stop_words'  : [stop_words]
}

grid = GridSearchCV(pipe,
                    grid_param,
                    n_jobs = -1, # -1 means using all processors
                    verbose = 2)

In [20]:
grid.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    7.4s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'cvec__ngram_range': [(1, 1), (1, 2), (1, 3)], 'cvec__max_features': [300, None], 'cvec__min_df': [2, 3], 'cvec__max_df': [0.95, 0.05], 'cvec__stop_words': [{'d', 'whom', 'before', "you'll", 'at', "should've", 'don', 'won', "don't", 'ourselves', 'its', "hadn't", 'was', 'on', 'any', 'some..., 'needn', 'y', 'why', 'about', 'yourself', 'than', 'an', 'his', 'from', 'this', "shan't", 'were'}]},
       pre_dispatch='2*n_jobs', re

In [9]:
print(f'The grid best_score_ is: {round(grid.best_score_,4)}') #taken from lesson 4.06

The grid best_score_ is: 0.762


In [10]:
grid.score(X_train,y_train)

0.9578877005347594

In [11]:
grid.score(X_test,y_test)

0.7915831663326653

## Confusion matrix results:

In [12]:
preds = grid.predict(X_test)

In [13]:
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

In [14]:
 balanced_accuracy_score(y_test, preds)

0.7912864670350854

In [15]:
def total_metrics(insta_model,X_test, y_test):
    preds = insta_model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn+fp)
    precision =  tp / (tp + fp)
    bas = balanced_accuracy_score(y_test,preds)
    
    print("\u0332".join("RESULTS OF COUNT-VECTORIZER / LINEAR REGRESSION MODEL "))
    print('')
    print(f"          Accuracy: {round(insta_model.best_score_,3)}")
    print('')
    print(f" Balance Accuracy: {round(bas,3)}")
    print('')
    print(f"       Sensitivity: {round(sensitivity,3)}")
    print('')
    print(f"       Specificity: {round(specificity,3)}")
    print('')
    print(f"         Precision: {round(precision,3)}")
    print('')    
    
    

In [16]:
total_metrics(grid,X_test,y_test)

R̲E̲S̲U̲L̲T̲S̲ ̲O̲F̲ ̲C̲O̲U̲N̲T̲-̲V̲E̲C̲T̲O̲R̲I̲Z̲E̲R̲ ̲/̲ ̲L̲I̲N̲E̲A̲R̲ ̲R̲E̲G̲R̲E̲S̲S̲I̲O̲N̲ ̲M̲O̲D̲E̲L̲ 

          Accuracy: 0.762

 Balance Accuracy: 0.791

       Sensitivity: 0.841

       Specificity: 0.742

         Precision: 0.767



In [17]:
grid.score(X_train,y_train)

0.9578877005347594

In [18]:
grid.score(X_test,y_test)

0.7915831663326653