In [2]:
# librería para manejar las flexiones gramaticales en el idioma inglés.
!pip install inflect
!pip install --disable-pip-version-check --progress-bar off -q https://github.com/pandas-profiling/pandas-profiling/archive/master.zip



In [3]:
seed = 161

import pandas as pd
import warnings; warnings.simplefilter('ignore')
import nltk

from nltk.corpus import stopwords


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


from sklearn.pipeline import Pipeline

from joblib import dump, load


In [4]:
# Descargando las stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /Users/andres/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andres/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/andres/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
from sklearn.utils import resample



In [6]:
# Se cargan los datos. 

file_name = 'reviews_train_val.csv'
reviews_df = pd.read_csv('data/reviews_train_val.csv', sep = ',')

neg_class_resampled = resample(
    reviews_df, replace=False, n_samples=2000,
    random_state=1234,
)
reviews_df = neg_class_resampled.reset_index(drop=True)

In [7]:
# Se filtran los que no tienen etiqueta
reviews_labeled_df = reviews_df.loc[reviews_df['stars'].notnull()]
reviews_labeled_df.shape

(2000, 2)

In [8]:


X_train, X_test, y_train, y_test = train_test_split(reviews_labeled_df['text'], reviews_labeled_df['stars'], test_size = 0.2, stratify = reviews_labeled_df['stars'], random_state = 1)

X_train.shape

(1600,)

In [9]:
from Processor.TextProcessor import TextProcesser


estimators = [
    ("processer", TextProcesser()),
    ("vect", CountVectorizer(max_df = 0.5, min_df = 0.001)),
]
processing_pipeline = Pipeline(estimators)


In [10]:
X_train = processing_pipeline.fit_transform(X_train).toarray()
X_test = processing_pipeline.transform(X_test).toarray()

In [11]:
dump(processing_pipeline, "assets/pipeline.joblib")

['assets/pipeline.joblib']

In [12]:
from sklearn.linear_model import LogisticRegression

best_model = LogisticRegression(C=50,
 multi_class = 'auto',
 penalty= 'l2',
 solver= 'newton-cg',
 )


In [13]:
best_model.fit(X_train, y_train)

LogisticRegression(C=50, solver='newton-cg')

In [14]:

dump(best_model, "assets/model.joblib")

['assets/model.joblib']

In [15]:
preds_train = best_model.predict(X_train)
preds_test = best_model.predict(X_test)

In [16]:
from sklearn.metrics import precision_recall_fscore_support


def pandas_classification_report(y_true, y_pred):
    metrics_summary = precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred)

    avg = list(precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            average='weighted'))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index)

    support = class_report_df.loc['support']
    total = support.sum()
    avg[-1] = total

    class_report_df['avg'] = avg

    return class_report_df.T

df_class_report = pandas_classification_report(y_test, preds_test)

df_class_report.to_json("assets/report.json")


In [17]:
vocabulary = processing_pipeline.steps[1][1].vocabulary_

coef = pd.DataFrame(best_model.coef_)
coef.columns = vocabulary

coef.to_json("assets/coefficients.json")


In [18]:
coef.head()


Unnamed: 0,cal,appoint,show,job,nee,don,took,12,week,return,...,zen,antique,21st,drag,generously,potatoe,ribey,ribeye,remak,1215
0,0.499474,-0.124017,0.529845,0.019718,-0.110217,-0.097936,0.085162,-0.013951,0.052404,-0.021919,...,-0.005244,-0.014394,-0.138434,-0.01536,-0.021067,0.023912,-0.014336,-0.013435,0.04986341,0.04986341
1,-0.371662,-0.260572,-0.019276,-0.165761,0.148351,0.119268,-0.08205,-0.083319,-0.010411,0.037686,...,-0.003946,-0.01831,-0.159139,-0.031264,-0.075086,0.141798,0.067785,0.06757,-0.02044468,-0.02044468
2,-0.477261,-0.000295,-0.239518,-0.052297,-0.020076,-0.064163,-0.164466,0.157978,-0.000213,0.089819,...,-0.139776,-0.078317,-0.141547,0.175743,-0.218274,-0.04632,-0.027091,-0.027093,-0.05918417,-0.05918417
3,0.35696,0.19448,-0.278374,-0.157482,-0.016475,-0.033787,0.231062,-0.047651,-4.4e-05,-0.056807,...,0.291916,0.243979,0.340494,-0.182297,0.649101,-0.150428,-0.002463,-0.002463,0.02976568,0.02976568
4,-0.007511,0.190405,0.007322,0.355821,-0.001583,0.076618,-0.069709,-0.013057,-0.041735,-0.048779,...,-0.14295,-0.132959,0.098626,0.053178,-0.334675,0.031037,-0.023895,-0.02458,-2.289256e-07,-2.289256e-07


In [19]:
from sklearn.metrics import confusion_matrix

cm_test = confusion_matrix(y_test, preds_test, labels = best_model.classes_)
cm_test_norm = confusion_matrix(y_test, preds_test, labels = best_model.classes_, normalize = 'all')

print(cm_test)

[[ 41   2   4   3  10]
 [  8   3   8   8   6]
 [  8   5   8  13   9]
 [  4   3  12  33  36]
 [  4   2   6  34 130]]
