<a href="https://colab.research.google.com/github/stevanmatovic/serbian-sentiment-analysis/blob/master/logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
from google.colab import drive
from IPython.display import Markdown, display
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_files
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt

In [8]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
df = pd.read_csv('/content/drive/My Drive/final.csv')
dfu = pd.read_csv('/content/drive/My Drive/final_upweigting.csv')

In [0]:
def log_reg(x_train,y_train,x_test,y_test):
  param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],'solver':['lbfgs', 'liblinear', 'sag', 'saga']}
  grid = GridSearchCV(LogisticRegression(max_iter=3000), param_grid, cv=5)
  grid.fit(x_train, y_train)
  print("Best cross-validation score: {:.2f}".format(grid.best_score_))
  print("Best parameters: ", grid.best_params_)
  print("Best estimator: ", grid.best_estimator_)
  lr = grid.best_estimator_
  y_pred = lr.predict(x_test)
  print(classification_report(y_test, y_pred))

In [17]:
for dataset in [df,dfu]:
    if not dataset.equals(dfu):
        display(Markdown('<h1>Dataset without upweighting:</h1>'))
    else:
        display(Markdown('<h1>Dataset with upweighting:</h1>'))
    df_x = dataset['Review']
    df_y = dataset['Label']
    x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)
    cv_u = CountVectorizer()
    cv_b = CountVectorizer(ngram_range=(2, 2))
    tv = TfidfVectorizer()

    unigram_train = cv_u.fit_transform(x_train)
    unigram_test = cv_u.transform(x_test)
    print("Number of features - unigram: {}".format(len(cv_u.get_feature_names())))

    bigram_train = cv_b.fit_transform(x_train)
    bigram_test = cv_b.transform(x_test)
    print("Number of features - bigram: {}".format(len(cv_b.get_feature_names())))

    tfidf_train = tv.fit_transform(x_train)
    tfidf_test = tv.transform(x_test)
    print("Number of features - tfidf: {}".format(len(tv.get_feature_names())))
    
    display(Markdown('<h2>Logistic Regression(unigram vectorization):</h2>'))
    log_reg(unigram_train,y_train,unigram_test,y_test)
    
    
    display(Markdown('<h2>Logistic Regression(bigram vectorization):</h2>'))
    log_reg(bigram_train,y_train,bigram_test,y_test)
    
    display(Markdown('<h2>Logistic Regression(tfidf vectorization):</h2>'))
    log_reg(tfidf_train,y_train,tfidf_test,y_test)

<h1>Dataset without upweighting:</h1>

Number of features - unigram: 21325
Number of features - bigram: 251583
Number of features - tfidf: 21325


<h2>Logistic Regression(unigram vectorization):</h2>



Best cross-validation score: 0.93
Best parameters:  {'C': 1, 'solver': 'sag'}
Best estimator:  LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=3000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='sag',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.77      0.61      0.68      1536
           1       0.95      0.98      0.97     12343

   micro avg       0.94      0.94      0.94     13879
   macro avg       0.86      0.79      0.82     13879
weighted avg       0.93      0.94      0.93     13879



<h2>Logistic Regression(bigram vectorization):</h2>



Best cross-validation score: 0.91
Best parameters:  {'C': 10, 'solver': 'lbfgs'}
Best estimator:  LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=3000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.73      0.35      0.47      1536
           1       0.92      0.98      0.95     12343

   micro avg       0.91      0.91      0.91     13879
   macro avg       0.83      0.67      0.71     13879
weighted avg       0.90      0.91      0.90     13879



<h2>Logistic Regression(tfidf vectorization):</h2>

Best cross-validation score: 0.93
Best parameters:  {'C': 10, 'solver': 'liblinear'}
Best estimator:  LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=3000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.74      0.61      0.67      1536
           1       0.95      0.97      0.96     12343

   micro avg       0.93      0.93      0.93     13879
   macro avg       0.85      0.79      0.82     13879
weighted avg       0.93      0.93      0.93     13879



<h1>Dataset with upweighting:</h1>

Number of features - unigram: 21325
Number of features - bigram: 259453
Number of features - tfidf: 21325


<h2>Logistic Regression(unigram vectorization):</h2>



Best cross-validation score: 0.93
Best parameters:  {'C': 1, 'solver': 'saga'}
Best estimator:  LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=3000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.77      0.63      0.69      1536
           1       0.96      0.98      0.97     12343

   micro avg       0.94      0.94      0.94     13879
   macro avg       0.86      0.80      0.83     13879
weighted avg       0.93      0.94      0.94     13879



<h2>Logistic Regression(bigram vectorization):</h2>



Best cross-validation score: 0.92
Best parameters:  {'C': 10, 'solver': 'lbfgs'}
Best estimator:  LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=3000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.74      0.43      0.54      1536
           1       0.93      0.98      0.96     12343

   micro avg       0.92      0.92      0.92     13879
   macro avg       0.84      0.71      0.75     13879
weighted avg       0.91      0.92      0.91     13879



<h2>Logistic Regression(tfidf vectorization):</h2>

Best cross-validation score: 0.93
Best parameters:  {'C': 10, 'solver': 'saga'}
Best estimator:  LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=3000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.74      0.61      0.67      1536
           1       0.95      0.97      0.96     12343

   micro avg       0.93      0.93      0.93     13879
   macro avg       0.85      0.79      0.81     13879
weighted avg       0.93      0.93      0.93     13879

