In [None]:
!pip install langdetect

In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np

In [None]:
import os

# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

In [None]:
from __future__ import print_function

import re
from io import StringIO

import eli5
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords
import sklearn
import sklearn.ensemble
import sklearn.metrics
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle

In [None]:
# Notebook-wide configuration: random seed, plot theme and pandas warning policy.
SEED = 42

sns.set_style("darkgrid")

# Turn off pandas' chained-assignment check (None = neither warn nor raise).
pd.options.mode.chained_assignment = None

# Seed langdetect's DetectorFactory with the notebook seed.
DetectorFactory.seed = SEED

In [None]:
def detect_language(text):
    """Return langdetect's language code for ``text``, or None when undetectable.

    Non-string cells (e.g. NaN from an empty CSV field) and blank strings are
    reported as None instead of being passed to ``detect``. A
    ``LangDetectException`` raised by ``detect`` is also mapped to None, so a
    single bad row cannot abort the whole ``apply``.
    """
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        return detect(text)
    except LangDetectException:
        return None


# Load the reviews, keep only the review text and its rating, and rename the
# text column to "Review".
data = (
    pd.read_csv("/kaggle/input/indeed/indeed_review_241120.csv")
    [["Review Raw", "Rating"]]
    .rename(columns={"Review Raw": "Review"})
)

# Keep only rows detected as English. Rows whose language cannot be detected
# are dropped. .copy() makes `data` an independent frame, so later column
# assignments do not write into a slice of the loaded frame.
data = data[data.Review.apply(detect_language) == "en"].copy()

In [None]:
# Preview the first five rows of the filtered reviews.
data.head(n=5)

In [None]:
# Bar chart of how many reviews carry each rating value.
# An explicit Axes plus a title and labels lets the figure stand on its own.
fig, ax = plt.subplots()
sns.countplot(x = "Rating", data = data, ax = ax)
ax.set(title = "Number of reviews per rating", xlabel = "Rating", ylabel = "Number of reviews")
plt.show()

In [None]:
# Recode ratings: 5 -> 4, 4 -> 3, 3 -> 2, and every other value -> 1,
# so the label column ends up with values in {1, 2, 3, 4}.
# NOTE: this cell is not idempotent - running it a second time remaps the
# already-recoded labels again.
RATING_TO_CLASS = {5: 4, 4: 3, 3: 2}
data["Rating"] = data.Rating.apply(lambda rating: RATING_TO_CLASS.get(rating, 1))

In [None]:
# Create the train/test split: 30% of the rows are held out for testing,
# stratified on the rating so both parts keep the same class proportions.
# The notebook-wide SEED is used instead of a second hard-coded 42.
X = data.drop(columns = "Rating")
y = data["Rating"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, random_state = SEED, stratify = y
)

In [None]:
# Fit the first classifier: token counts from CountVectorizer fed into a
# logistic regression, trained on the raw (uncleaned) review text.
# `vec` and `clf` stay bound separately because later cells inspect them.
vec = CountVectorizer()
clf = LogisticRegression(max_iter = 500)
pipe = make_pipeline(vec, clf)
pipe.fit(X_train["Review"], y_train)

In [None]:
def print_report(pipe, texts=None, labels=None):
    """Print a classification report and the accuracy of ``pipe``.

    Parameters
    ----------
    pipe : fitted estimator exposing ``predict``
        Called as ``pipe.predict(texts)``.
    texts : optional
        Inputs to predict on. Defaults to the notebook-level ``X_test.Review``.
    labels : optional
        True labels for ``texts``. Defaults to the notebook-level ``y_test``.

    The defaults are looked up when the function is called, so they follow
    whatever ``X_test`` / ``y_test`` are bound to at that moment. Pass
    ``texts`` and ``labels`` explicitly once those names have been rebound to
    something without a ``Review`` attribute (a later cell turns them into
    plain lists).
    """
    if texts is None:
        texts = X_test.Review
    if labels is None:
        labels = y_test
    y_preds = pipe.predict(texts)
    print(metrics.classification_report(labels, y_preds))
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(labels, y_preds)))

In [None]:
# Print the classification report and accuracy of the raw-text pipeline.
print_report(pipe=pipe)

In [None]:
# For every class, print the ten features with the largest and the ten with
# the smallest logistic-regression coefficients.
#
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
# The vocabulary is identical for every class, so it is fetched once here
# rather than on every loop iteration.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

for i, tag in enumerate(clf.classes_):
    coefficients = clf.coef_[i]
    weights = list(zip(feature_names, coefficients))
    print('Tag:',tag)
    print('Most Positive Coefficients:')
    print(sorted(weights,key=lambda x: -x[1])[:10])
    print('Most Negative Coefficients:')
    print(sorted(weights,key=lambda x: x[1])[:10])
    print("--------------------------------------")

In [None]:
# Render eli5's weight table for the raw-text classifier (top = 20),
# using `vec` to map feature indices back to tokens.
eli5.show_weights(
    clf,
    vec = vec,
    top = 20,
)

# Base model 

In [None]:
# Characters that clean_text turns into a single space: / ( ) { } [ ] | @ , ;
# Raw string so that \[ \] \| reach the regex engine without relying on
# Python's handling of invalid string escapes (which emits a warning).
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, a lowercase ASCII letter, a space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# NLTK's English stop-word list, held as a set for the membership test in clean_text.
STOPWORDS = {word for word in stopwords.words('english')}

def clean_text(text):
    """
        Normalise a review string before vectorisation.

        Steps, in order: lowercase the text; replace every
        REPLACE_BY_SPACE_RE match with a space; delete every BAD_SYMBOLS_RE
        match; split on whitespace, drop tokens found in STOPWORDS and join
        the remaining tokens with single spaces.

        text: a string

        return: the cleaned string
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    # NOTE(review): BAD_SYMBOLS_RE has already removed apostrophes at this
    # point, so "don't" is compared as "dont". Any STOPWORDS entry that
    # contains an apostrophe (if the NLTK list has such entries - verify)
    # can therefore never match here.
    kept_tokens = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)
    
# Replace every review with its cleaned form.
data['Review'] = data.Review.apply(clean_text)

list_corpus = data.Review.tolist()
list_labels = data.Rating.tolist()

# Split with the notebook-wide SEED and stratify on the labels, matching the
# settings of the first split. With the same labels, seed and test size, this
# should reproduce the first split's partition, which makes the two models'
# metrics comparable.
# NOTE: this rebinds X_train / X_test / y_train / y_test to plain lists,
# replacing the DataFrame/Series split created earlier.
X_train, X_test, y_train, y_test = train_test_split(
    list_corpus, list_labels, test_size = 0.3, random_state = SEED, stratify = list_labels
)

# Binary bag-of-words features (binary=True): tokens are runs of one or more
# word characters, with scikit-learn's built-in English stop words removed.
vectorizer = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}', stop_words = 'english', binary=True)
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

# Train logistic regression on the binary bag-of-words features and score it
# on the held-out vectors.
logreg = LogisticRegression(n_jobs=1, max_iter = 2000)
logreg.fit(train_vectors, y_train)
pred = logreg.predict(test_vectors)

# Accuracy plus precision, recall and F1, the latter three with average='weighted'.
accuracy = accuracy_score(y_test, pred)
precision, recall, f1 = (
    scorer(y_test, pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)
summary = (accuracy, precision, recall, f1)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % summary)

In [None]:
# Render eli5's weight table for the cleaned-text model (top = 20),
# using `vectorizer` to map feature indices back to tokens.
eli5.show_weights(
    logreg,
    vec = vectorizer,
    top = 20,
)