<a href="https://colab.research.google.com/github/stevanmatovic/serbian-sentiment-analysis/blob/master/logistic_regression_svm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
from google.colab import drive
from IPython.display import Markdown, display
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt

In [16]:
# Mount Google Drive in the Colab runtime so the CSV files under
# '/content/drive/My Drive/' can be read by the following cells.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Load the two review datasets from the mounted Drive.
# `df` is the dataset without upweighting and `dfu` the one with upweighting
# (that is how the experiment cells below label them); both are read for their
# 'Review' (text) and 'Label' (class) columns.
df = pd.read_csv('/content/drive/My Drive/final.csv')
dfu = pd.read_csv('/content/drive/My Drive/final_upweigting.csv')

In [0]:
def log_reg(x_train, y_train, x_test, y_test, param_grid=None, cv=5, max_iter=100):
  """Grid-search a logistic-regression classifier and print its test-set report.

  A `GridSearchCV` is fitted on the training data, the winning configuration is
  printed, and the best estimator is then evaluated on the held-out test data
  with `classification_report`.

  Args:
    x_train: Training feature matrix (the callers pass vectorizer output).
    y_train: Training labels.
    x_test: Test feature matrix, produced by the same fitted vectorizer.
    y_test: Test labels.
    param_grid: Optional hyper-parameter grid for `GridSearchCV`. When omitted,
      `C` in {0.01, 0.1, 1, 10, 100} is crossed with the solvers
      'lbfgs', 'liblinear', 'sag' and 'saga'.
    cv: Cross-validation setting forwarded to `GridSearchCV` (default 5).
    max_iter: Iteration cap for each `LogisticRegression` fit (default 100).
      Raise it if the solvers emit convergence warnings.

  Returns:
    None. All results are written to stdout.
  """
  # Build the default grid per call rather than using a mutable default argument.
  if param_grid is None:
    param_grid = {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear', 'sag', 'saga'],
    }

  grid = GridSearchCV(LogisticRegression(max_iter=max_iter), param_grid, cv=cv)
  grid.fit(x_train, y_train)

  # No `scoring` is passed, so the score below is GridSearchCV's default metric
  # (the estimator's own `score` method).
  print("Best cross-validation score: {:.2f}".format(grid.best_score_))
  print("Best parameters: ", grid.best_params_)
  print("Best estimator: ", grid.best_estimator_)

  # Evaluate the selected model on data that took no part in the search.
  best_model = grid.best_estimator_
  y_pred = best_model.predict(x_test)
  print(classification_report(y_test, y_pred))

In [19]:
# Logistic-regression experiment: for each dataset, make an 80/20 split, build
# four text representations (unigram, bigram and trigram counts, plus tf-idf)
# and grid-search a classifier on each of them.
#
# Every dataset is paired with its heading explicitly. Identifying the frame
# with `dataset.equals(dfu)` compares all cells of both frames on every
# iteration and would label both runs "with upweighting" if the two files ever
# had equal contents.
for title, dataset in [('<h1>Dataset without upweighting:</h1>', df),
                       ('<h1>Dataset with upweighting:</h1>', dfu)]:
    display(Markdown(title))
    df_x = dataset['Review']
    df_y = dataset['Label']
    # Fixed random_state keeps the split reproducible; the same split is reused
    # for all four representations so their reports are comparable.
    x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)
    cv_u = CountVectorizer()
    cv_b = CountVectorizer(ngram_range=(2, 2))
    cv_t = CountVectorizer(ngram_range=(3, 3))

    tv = TfidfVectorizer()

    # Vectorizers are fitted on the training split only and then applied to the
    # test split. The feature count is read from the fitted matrix's second
    # dimension: `get_feature_names()` was deprecated in scikit-learn 1.0 and
    # removed in 1.2, whereas `shape[1]` works on every version.
    unigram_train = cv_u.fit_transform(x_train)
    unigram_test = cv_u.transform(x_test)
    print("Number of features - unigram: {}".format(unigram_train.shape[1]))

    bigram_train = cv_b.fit_transform(x_train)
    bigram_test = cv_b.transform(x_test)
    print("Number of features - bigram: {}".format(bigram_train.shape[1]))

    trigram_train = cv_t.fit_transform(x_train)
    trigram_test = cv_t.transform(x_test)
    print("Number of features - trigram: {}".format(trigram_train.shape[1]))

    tfidf_train = tv.fit_transform(x_train)
    tfidf_test = tv.transform(x_test)
    print("Number of features - tfidf: {}".format(tfidf_train.shape[1]))

    display(Markdown('<h2>Logistic Regression(unigram vectorization):</h2>'))
    log_reg(unigram_train,y_train,unigram_test,y_test)

    display(Markdown('<h2>Logistic Regression(bigram vectorization):</h2>'))
    log_reg(bigram_train,y_train,bigram_test,y_test)

    display(Markdown('<h2>Logistic Regression(trigram vectorization):</h2>'))
    log_reg(trigram_train,y_train,trigram_test,y_test)

    display(Markdown('<h2>Logistic Regression(tfidf vectorization):</h2>'))
    log_reg(tfidf_train,y_train,tfidf_test,y_test)

<h1>Dataset without upweighting:</h1>

Number of features - unigram: 21325
Number of features - bigram: 251583
Number of features - trigram: 434333
Number of features - tfidf: 21325


<h2>Logistic Regression(unigram vectorization):</h2>



Best cross-validation score: 0.93
Best parameters:  {'C': 1, 'solver': 'liblinear'}
Best estimator:  LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.75      0.60      0.66      1536
           1       0.95      0.97      0.96     12343

   micro avg       0.93      0.93      0.93     13879
   macro avg       0.85      0.79      0.81     13879
weighted avg       0.93      0.93      0.93     13879



<h2>Logistic Regression(bigram vectorization):</h2>



Best cross-validation score: 0.91
Best parameters:  {'C': 10, 'solver': 'lbfgs'}
Best estimator:  LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.73      0.35      0.48      1536
           1       0.92      0.98      0.95     12343

   micro avg       0.91      0.91      0.91     13879
   macro avg       0.83      0.67      0.71     13879
weighted avg       0.90      0.91      0.90     13879





<h2>Logistic Regression(trigram vectorization):</h2>



Best cross-validation score: 0.89
Best parameters:  {'C': 100, 'solver': 'liblinear'}
Best estimator:  LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.67      0.08      0.14      1536
           1       0.90      1.00      0.94     12343

   micro avg       0.89      0.89      0.89     13879
   macro avg       0.79      0.54      0.54     13879
weighted avg       0.87      0.89      0.85     13879



<h2>Logistic Regression(tfidf vectorization):</h2>



Best cross-validation score: 0.93
Best parameters:  {'C': 10, 'solver': 'liblinear'}
Best estimator:  LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.74      0.61      0.67      1536
           1       0.95      0.97      0.96     12343

   micro avg       0.93      0.93      0.93     13879
   macro avg       0.85      0.79      0.82     13879
weighted avg       0.93      0.93      0.93     13879



<h1>Dataset with upweighting:</h1>

Number of features - unigram: 21325
Number of features - bigram: 259453
Number of features - trigram: 468529
Number of features - tfidf: 21325


<h2>Logistic Regression(unigram vectorization):</h2>



Best cross-validation score: 0.93
Best parameters:  {'C': 0.1, 'solver': 'liblinear'}
Best estimator:  LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.78      0.57      0.66      1536
           1       0.95      0.98      0.96     12343

   micro avg       0.93      0.93      0.93     13879
   macro avg       0.87      0.78      0.81     13879
weighted avg       0.93      0.93      0.93     13879



<h2>Logistic Regression(bigram vectorization):</h2>



Best cross-validation score: 0.92
Best parameters:  {'C': 10, 'solver': 'lbfgs'}
Best estimator:  LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.74      0.43      0.55      1536
           1       0.93      0.98      0.96     12343

   micro avg       0.92      0.92      0.92     13879
   macro avg       0.84      0.71      0.75     13879
weighted avg       0.91      0.92      0.91     13879





<h2>Logistic Regression(trigram vectorization):</h2>



Best cross-validation score: 0.89
Best parameters:  {'C': 100, 'solver': 'liblinear'}
Best estimator:  LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.70      0.11      0.20      1536
           1       0.90      0.99      0.94     12343

   micro avg       0.90      0.90      0.90     13879
   macro avg       0.80      0.55      0.57     13879
weighted avg       0.88      0.90      0.86     13879



<h2>Logistic Regression(tfidf vectorization):</h2>



Best cross-validation score: 0.93
Best parameters:  {'C': 10, 'solver': 'saga'}
Best estimator:  LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)
              precision    recall  f1-score   support

           0       0.74      0.61      0.67      1536
           1       0.95      0.97      0.96     12343

   micro avg       0.93      0.93      0.93     13879
   macro avg       0.85      0.79      0.81     13879
weighted avg       0.93      0.93      0.93     13879



In [0]:
def svm(x_train, y_train, x_test, y_test, param_grid=None, cv=5, max_iter=1000):
  """Grid-search a linear SVM classifier and print its test-set report.

  A `GridSearchCV` over `LinearSVC` is fitted on the training data, the winning
  configuration is printed, and the best estimator is then evaluated on the
  held-out test data with `classification_report`.

  Args:
    x_train: Training feature matrix (the callers pass vectorizer output).
    y_train: Training labels.
    x_test: Test feature matrix, produced by the same fitted vectorizer.
    y_test: Test labels.
    param_grid: Optional hyper-parameter grid for `GridSearchCV`. When omitted,
      `C` is searched over {0.01, 0.1, 1, 10}.
    cv: Cross-validation setting forwarded to `GridSearchCV` (default 5).
    max_iter: Iteration cap for each `LinearSVC` fit (default 1000).
      Raise it if the fits emit convergence warnings.

  Returns:
    None. All results are written to stdout.
  """
  # Build the default grid per call rather than using a mutable default argument.
  if param_grid is None:
    param_grid = {'C': [0.01, 0.1, 1, 10]}

  grid = GridSearchCV(LinearSVC(max_iter=max_iter), param_grid, cv=cv)
  grid.fit(x_train, y_train)

  # No `scoring` is passed, so the score below is GridSearchCV's default metric
  # (the estimator's own `score` method).
  print("Best cross-validation score: {:.2f}".format(grid.best_score_))
  print("Best parameters: ", grid.best_params_)
  print("Best estimator: ", grid.best_estimator_)

  # Evaluate the selected model on data that took no part in the search.
  best_svm = grid.best_estimator_
  y_pred = best_svm.predict(x_test)
  print(classification_report(y_test, y_pred))

In [22]:
# Linear-SVM experiment: for each dataset, make an 80/20 split, build four text
# representations (unigram, bigram and trigram counts, plus tf-idf) and
# grid-search a classifier on each of them.
#
# Every dataset is paired with its heading explicitly. Identifying the frame
# with `dataset.equals(dfu)` compares all cells of both frames on every
# iteration and would label both runs "with upweighting" if the two files ever
# had equal contents.
for title, dataset in [('<h1>Dataset without upweighting:</h1>', df),
                       ('<h1>Dataset with upweighting:</h1>', dfu)]:
    display(Markdown(title))
    df_x = dataset['Review']
    df_y = dataset['Label']
    # Fixed random_state keeps the split reproducible; the same split is reused
    # for all four representations so their reports are comparable.
    x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)
    cv_u = CountVectorizer()
    cv_b = CountVectorizer(ngram_range=(2, 2))
    cv_t = CountVectorizer(ngram_range=(3, 3))
    tv = TfidfVectorizer()

    # Vectorizers are fitted on the training split only and then applied to the
    # test split. The feature count is read from the fitted matrix's second
    # dimension: `get_feature_names()` was deprecated in scikit-learn 1.0 and
    # removed in 1.2, whereas `shape[1]` works on every version.
    unigram_train = cv_u.fit_transform(x_train)
    unigram_test = cv_u.transform(x_test)
    print("Number of features - unigram: {}".format(unigram_train.shape[1]))

    bigram_train = cv_b.fit_transform(x_train)
    bigram_test = cv_b.transform(x_test)
    print("Number of features - bigram: {}".format(bigram_train.shape[1]))

    trigram_train = cv_t.fit_transform(x_train)
    trigram_test = cv_t.transform(x_test)
    print("Number of features - trigram: {}".format(trigram_train.shape[1]))

    tfidf_train = tv.fit_transform(x_train)
    tfidf_test = tv.transform(x_test)
    print("Number of features - tfidf: {}".format(tfidf_train.shape[1]))

    display(Markdown('<h2>SVM classifier(unigram vectorization):</h2>'))
    svm(unigram_train,y_train,unigram_test,y_test)

    display(Markdown('<h2>Linear SVM classifier(bigram vectorization):</h2>'))
    svm(bigram_train,y_train,bigram_test,y_test)

    display(Markdown('<h2>SVM classifier(trigram vectorization):</h2>'))
    svm(trigram_train,y_train,trigram_test,y_test)

    display(Markdown('<h2>Linear SVM classifier(tfidf vectorization):</h2>'))
    svm(tfidf_train,y_train,tfidf_test,y_test)

<h1>Dataset without upweighting:</h1>

Number of features - unigram: 21325
Number of features - bigram: 251583
Number of features - trigram: 434333
Number of features - tfidf: 21325


<h2>SVM classifier(unigram vectorization):</h2>



Best cross-validation score: 0.93
Best parameters:  {'C': 0.1}
Best estimator:  LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
              precision    recall  f1-score   support

           0       0.75      0.59      0.66      1536
           1       0.95      0.98      0.96     12343

   micro avg       0.93      0.93      0.93     13879
   macro avg       0.85      0.78      0.81     13879
weighted avg       0.93      0.93      0.93     13879



<h2>Linear SVM classifier(bigram vectorization):</h2>



Best cross-validation score: 0.91
Best parameters:  {'C': 0.1}
Best estimator:  LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
              precision    recall  f1-score   support

           0       0.75      0.31      0.43      1536
           1       0.92      0.99      0.95     12343

   micro avg       0.91      0.91      0.91     13879
   macro avg       0.83      0.65      0.69     13879
weighted avg       0.90      0.91      0.89     13879



<h2>SVM classifier(trigram vectorization):</h2>



Best cross-validation score: 0.89
Best parameters:  {'C': 1}
Best estimator:  LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
              precision    recall  f1-score   support

           0       0.64      0.06      0.12      1536
           1       0.90      1.00      0.94     12343

   micro avg       0.89      0.89      0.89     13879
   macro avg       0.77      0.53      0.53     13879
weighted avg       0.87      0.89      0.85     13879



<h2>Linear SVM classifier(tfidf vectorization):</h2>

Best cross-validation score: 0.93
Best parameters:  {'C': 1}
Best estimator:  LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
              precision    recall  f1-score   support

           0       0.72      0.60      0.66      1536
           1       0.95      0.97      0.96     12343

   micro avg       0.93      0.93      0.93     13879
   macro avg       0.84      0.79      0.81     13879
weighted avg       0.93      0.93      0.93     13879



<h1>Dataset with upweighting:</h1>

Number of features - unigram: 21325
Number of features - bigram: 259453
Number of features - trigram: 468529
Number of features - tfidf: 21325


<h2>SVM classifier(unigram vectorization):</h2>



Best cross-validation score: 0.93
Best parameters:  {'C': 0.1}
Best estimator:  LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
              precision    recall  f1-score   support

           0       0.74      0.59      0.65      1536
           1       0.95      0.97      0.96     12343

   micro avg       0.93      0.93      0.93     13879
   macro avg       0.84      0.78      0.81     13879
weighted avg       0.93      0.93      0.93     13879



<h2>Linear SVM classifier(bigram vectorization):</h2>



Best cross-validation score: 0.92
Best parameters:  {'C': 0.1}
Best estimator:  LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
              precision    recall  f1-score   support

           0       0.74      0.39      0.51      1536
           1       0.93      0.98      0.95     12343

   micro avg       0.92      0.92      0.92     13879
   macro avg       0.84      0.69      0.73     13879
weighted avg       0.91      0.92      0.91     13879



<h2>SVM classifier(trigram vectorization):</h2>



Best cross-validation score: 0.89
Best parameters:  {'C': 1}
Best estimator:  LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
              precision    recall  f1-score   support

           0       0.70      0.11      0.19      1536
           1       0.90      0.99      0.94     12343

   micro avg       0.90      0.90      0.90     13879
   macro avg       0.80      0.55      0.57     13879
weighted avg       0.88      0.90      0.86     13879



<h2>Linear SVM classifier(tfidf vectorization):</h2>

Best cross-validation score: 0.93
Best parameters:  {'C': 1}
Best estimator:  LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
              precision    recall  f1-score   support

           0       0.74      0.60      0.66      1536
           1       0.95      0.97      0.96     12343

   micro avg       0.93      0.93      0.93     13879
   macro avg       0.84      0.79      0.81     13879
weighted avg       0.93      0.93      0.93     13879

