# Multinomial Logistic Regression

In [1]:
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``.

    The file is opened in binary mode and closed as soon as loading finishes.

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the file,
    so only point this at pickle files produced by a trusted pipeline.
    """
    with open(path, 'rb') as data:
        return pickle.load(data)


# Dataframe
path_df = "Pickles/df.pickle"
df = _load_pickle(path_df)

# features_train
path_features_train = "Pickles/features_train.pickle"
features_train = _load_pickle(path_features_train)

# labels_train
path_labels_train = "Pickles/labels_train.pickle"
labels_train = _load_pickle(path_labels_train)

# features_test
path_features_test = "Pickles/features_test.pickle"
features_test = _load_pickle(path_features_test)

# labels_test
path_labels_test = "Pickles/labels_test.pickle"
labels_test = _load_pickle(path_labels_test)

In [3]:
# Sanity check: print the dimensions of the train and test feature matrices.
for feature_matrix in (features_train, features_test):
    print(feature_matrix.shape)

(2041, 300)
(876, 300)


## Ajustes de parâmetros com Cross-Validation

In [4]:
# Instantiate a logistic regression with only the seed set, so that its
# current settings can be listed before any tuning.
lr_0 = LogisticRegression(random_state=10)

current_params = lr_0.get_params()
print('Parameters currently in use:\n')
pprint(current_params)

Parameters currently in use:

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 10,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}


### Randomized Search Cross Validation

In [5]:
# Candidate values for LogisticRegression's `C`: ten evenly spaced points
# from 0.1 to 1.0, converted to plain Python floats.
C = np.linspace(0.1, 1, 10).tolist()

# Remaining candidate settings, one list per hyperparameter.
multi_class = ['multinomial']
solver = ['newton-cg', 'sag', 'saga', 'lbfgs']
class_weight = ['balanced', None]
penalty = ['l2']

# Search space handed to the randomized search.
random_grid = dict(
    C=C,
    multi_class=multi_class,
    solver=solver,
    class_weight=class_weight,
    penalty=penalty,
)

pprint(random_grid)

{'C': [0.1,
       0.2,
       0.30000000000000004,
       0.4,
       0.5,
       0.6,
       0.7000000000000001,
       0.8,
       0.9,
       1.0],
 'class_weight': ['balanced', None],
 'multi_class': ['multinomial'],
 'penalty': ['l2'],
 'solver': ['newton-cg', 'sag', 'saga', 'lbfgs']}


In [6]:
# Estimator whose hyperparameters are tuned; the seed is fixed.
lrc = LogisticRegression(random_state=10)

# Evaluate 50 settings sampled from `random_grid`, scoring each by accuracy
# with cv=5.
random_search = RandomizedSearchCV(
    lrc,
    random_grid,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    verbose=1,
    random_state=10,
)

# Last expression of the cell, so the fitted search object is displayed.
random_search.fit(features_train, labels_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


RandomizedSearchCV(cv=5, estimator=LogisticRegression(random_state=10),
                   n_iter=50,
                   param_distributions={'C': [0.1, 0.2, 0.30000000000000004,
                                              0.4, 0.5, 0.6, 0.7000000000000001,
                                              0.8, 0.9, 1.0],
                                        'class_weight': ['balanced', None],
                                        'multi_class': ['multinomial'],
                                        'penalty': ['l2'],
                                        'solver': ['newton-cg', 'sag', 'saga',
                                                   'lbfgs']},
                   random_state=10, scoring='accuracy', verbose=1)

In [7]:
# Report the winning parameter set and its score, separated by a blank line.
print(random_search.best_params_, "", random_search.best_score_, sep="\n")

{'solver': 'lbfgs', 'penalty': 'l2', 'multi_class': 'multinomial', 'class_weight': 'balanced', 'C': 0.1}

0.6339973153075411


### Grid Search Cross Validation

In [8]:
# Narrowed search space: only `C` varies; every other setting is pinned to a
# single value.
# NOTE(review): the randomized search output above reports C=0.1 as its best
# value, which lies outside this 0.3-1.0 range — confirm the range is intended.
C = np.linspace(0.3, 1, 10).tolist()
multi_class = ['multinomial']
solver = ['lbfgs']
class_weight = ['balanced']
penalty = ['l2']

param_grid = dict(
    C=C,
    multi_class=multi_class,
    solver=solver,
    class_weight=class_weight,
    penalty=penalty,
)

lrc = LogisticRegression(random_state=10)

# Three seeded shuffle splits with test_size=0.33, used as the CV scheme.
cv_sets = ShuffleSplit(n_splits=3, test_size=0.33, random_state=10)

grid_search = GridSearchCV(
    lrc,
    param_grid,
    scoring='accuracy',
    cv=cv_sets,
    verbose=1,
)

# Last expression of the cell, so the fitted search object is displayed.
grid_search.fit(features_train, labels_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=10, test_size=0.33, train_size=None),
             estimator=LogisticRegression(random_state=10),
             param_grid={'C': [0.3, 0.37777777777777777, 0.45555555555555555,
                               0.5333333333333333, 0.6111111111111112,
                               0.6888888888888889, 0.7666666666666666,
                               0.8444444444444446, 0.9222222222222223, 1.0],
                         'class_weight': ['balanced'],
                         'multi_class': ['multinomial'], 'penalty': ['l2'],
                         'solver': ['lbfgs']},
             scoring='accuracy', verbose=1)

In [9]:
print(grid_search.best_params_)
print("")
print(grid_search.best_score_)

{'C': 0.8444444444444446, 'class_weight': 'balanced', 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'lbfgs'}

0.6201780415430268


In [10]:
# Keep the estimator selected by the grid search (scored on accuracy).
best_lrc = grid_search.best_estimator_

# Bare expression so the notebook displays the selected model's configuration.
best_lrc

LogisticRegression(C=0.8444444444444446, class_weight='balanced',
                   multi_class='multinomial', random_state=10)

## Treinamento do modelo e avaliação

In [11]:
# Fit the selected model on the full training set.
# NOTE(review): `grid_search` was created without overriding `refit`, so
# `best_estimator_` was presumably already refit on this same data — confirm
# whether this extra fit is needed.
best_lrc.fit(features_train, labels_train)

LogisticRegression(C=0.8444444444444446, class_weight='balanced',
                   multi_class='multinomial', random_state=10)

In [12]:
# Predict labels for the test features; reused by the evaluation cells below.
lrc_pred = best_lrc.predict(features_test)

#### Acurácia no treino

In [13]:
# Accuracy of the tuned model on the data it was trained on.
train_predictions = best_lrc.predict(features_train)
print(accuracy_score(labels_train, train_predictions))

0.7114159725624694


#### Acurácia no teste

In [14]:
# Accuracy of the stored test-set predictions against the true test labels.
test_accuracy = accuracy_score(labels_test, lrc_pred)
print(test_accuracy)

0.6415525114155252


#### Classificação

In [15]:
# Per-class precision, recall and F1 for the test-set predictions.
report = classification_report(labels_test, lrc_pred)
print(report)

              precision    recall  f1-score   support

           0       0.63      0.59      0.61       291
           1       0.65      0.72      0.68       289
           2       0.64      0.61      0.63       296

    accuracy                           0.64       876
   macro avg       0.64      0.64      0.64       876
weighted avg       0.64      0.64      0.64       876



#### Comparando

In [16]:
# Baseline for comparison: logistic regression with only the seed set,
# trained on the same data and scored on the same test set.
base_model = LogisticRegression(random_state=10).fit(features_train, labels_train)
base_pred = base_model.predict(features_test)
accuracy_score(labels_test, base_pred)

0.639269406392694

In [17]:
# Tuned model, fitted and scored the same way as the baseline cell.
best_lrc.fit(features_train, labels_train)
tuned_test_pred = best_lrc.predict(features_test)
accuracy_score(labels_test, tuned_test_pred)

0.6415525114155252

In [18]:
# Compute both accuracies first, then assemble the one-row summary record.
train_acc = accuracy_score(labels_train, best_lrc.predict(features_train))
test_acc = accuracy_score(labels_test, lrc_pred)

d = {
    'Model': 'Logistic Regression',
    'Training Set Accuracy': train_acc,
    'Test Set Accuracy': test_acc,
}

# Single-row frame (index 0) holding this model's results.
df_models_lrc = pd.DataFrame(d, index=[0])

In [19]:
# Bare expression so the notebook renders the one-row summary table.
df_models_lrc

Unnamed: 0,Model,Training Set Accuracy,Test Set Accuracy
0,Logistic Regression,0.711416,0.641553


In [20]:
# Persist the tuned model and its summary table, in that order.
artifacts = (
    ('Models/best_lrc.pickle', best_lrc),
    ('Models/df_models_lrc.pickle', df_models_lrc),
)
for target_path, artifact in artifacts:
    with open(target_path, 'wb') as output:
        pickle.dump(artifact, output)