In [41]:
# librería para manejar las flexiones gramaticales en el idioma inglés.
!pip install inflect
!pip install --disable-pip-version-check --progress-bar off -q https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
!pip install -U pandas-profiling



In [42]:
seed = 161

import pandas as pd
import warnings; warnings.simplefilter('ignore')
import nltk

from nltk.corpus import stopwords


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


from sklearn.pipeline import Pipeline

from joblib import dump, load


In [43]:
# Descargando las stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /Users/andres/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andres/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/andres/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [44]:
from sklearn.utils import resample



In [45]:
# Se cargan los datos. 

file_name = 'reviews_train_val.csv'
reviews_df = pd.read_csv('data/reviews_train_val.csv', sep = ',')

neg_class_resampled = resample(
    reviews_df, replace=False, n_samples=5000,
    random_state=1234,
)
reviews_df = neg_class_resampled.reset_index(drop=True)

In [46]:
import json
from pandas_profiling import ProfileReport
ProfileReport(reviews_df, title='Profiling Report', explorative=True)

profile = reviews_df.profile_report()
p = profile.to_json()
y = json.loads(p)

stars = reviews_df.describe()



my_profile = {"count": y['table']['n'], "n_var" : y['table']["n_var"],
              "n_cells_missing": y['table']["n_cells_missing"],
"n_vars_with_missing": y['table']["n_vars_with_missing"],
"n_vars_all_missing": y['table']["n_vars_all_missing"],
"p_cells_missing": y['table']["p_cells_missing"],
"n_duplicates": y['table']["n_duplicates"],
"p_duplicates": y['table']["p_duplicates"],

              "stars": json.loads(stars.to_json())["stars"]
              }


with open("assets/profile.json", "w") as write_file:
    json.dump(my_profile, write_file)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

In [47]:
# Se filtran los que no tienen etiqueta
reviews_labeled_df = reviews_df.loc[reviews_df['stars'].notnull()]
reviews_labeled_df.shape

(5000, 2)

In [48]:
reviews_labeled_df.describe()

Unnamed: 0,stars
count,5000.0
mean,3.7324
std,1.476762
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [49]:


X_train, X_test, y_train, y_test = train_test_split(reviews_labeled_df['text'], reviews_labeled_df['stars'], test_size = 0.2, stratify = reviews_labeled_df['stars'], random_state = 1)

X_train.shape

(4000,)

In [50]:
from Processor.TextProcessor import TextProcesser


estimators = [
    ("processer", TextProcesser()),
    ("vect", CountVectorizer(max_df = 0.5, min_df = 0.001)),
]
processing_pipeline = Pipeline(estimators)


In [51]:
X_train = processing_pipeline.fit_transform(X_train).toarray()
X_test = processing_pipeline.transform(X_test).toarray()

In [52]:
dump(processing_pipeline, "assets/pipeline.joblib")

['assets/pipeline.joblib']

In [53]:
from sklearn.linear_model import LogisticRegression

best_model = LogisticRegression(C=50,
 multi_class = 'auto',
 penalty= 'l2',
 solver= 'newton-cg',
 )


In [54]:
best_model.fit(X_train, y_train)

LogisticRegression(C=50, solver='newton-cg')

In [55]:

dump(best_model, "assets/model.joblib")

['assets/model.joblib']

In [56]:
preds_train = best_model.predict(X_train)
preds_test = best_model.predict(X_test)

In [57]:
from sklearn.metrics import precision_recall_fscore_support


def pandas_classification_report(y_true, y_pred):
    metrics_summary = precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred)

    avg = list(precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            average='weighted'))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index)

    support = class_report_df.loc['support']
    total = support.sum()
    avg[-1] = total

    class_report_df['avg'] = avg

    return class_report_df.T

df_class_report = pandas_classification_report(y_test, preds_test)

df_class_report.to_json("assets/report.json")


In [58]:
vocabulary = processing_pipeline.steps[1][1].vocabulary_

coef = pd.DataFrame(best_model.coef_)
coef.columns = vocabulary

coef.to_json("assets/coefficients.json")

In [59]:
coef.head()


Unnamed: 0,hid,gem,heard,plac,facebook,going,are,decid,try,back,...,avenue,recovery,youth,silly,raspberry,diagnose,quieter,smallest,removal,continental
0,0.337641,0.453329,0.238561,-0.105164,0.197473,0.056879,0.108483,0.297773,0.014646,0.621482,...,-0.001439,0.728328,-0.246797,-0.381455,1.291041,-0.015679,-0.015679,0.164571,-0.026744,-0.026744
1,0.026949,0.047351,-0.054801,-0.002226,-0.196332,-0.049161,-0.07947,-0.080023,0.233351,0.154089,...,-0.000466,-0.195129,-0.220451,-0.482146,-0.176185,-0.047204,-0.047204,-0.10706,-0.025493,-0.025493
2,-0.859797,-0.379957,-0.398707,-0.001451,-0.304075,-7e-05,-0.3472,-0.244317,0.243933,-0.476007,...,-0.044347,-0.128989,-0.53324,0.991808,-0.140551,0.06286,0.06286,-0.027768,-0.0001,-0.0001
3,0.335877,-0.825875,-0.588897,0.312287,0.547068,0.028245,0.07598,-0.139477,-0.351679,-0.31259,...,0.012401,-0.077947,-0.315446,-0.360643,-0.250027,-0.269668,-0.269668,0.512324,0.045655,0.045655
4,0.159331,0.705152,0.803844,-0.203445,-0.244134,-0.035894,0.242208,0.166043,-0.140252,0.013026,...,0.03385,-0.326264,1.315935,0.232437,-0.724279,0.269692,0.269692,-0.542067,0.006682,0.006682


In [60]:
from sklearn.metrics import confusion_matrix

cm_test = pd.DataFrame(confusion_matrix(y_test, preds_test, labels = best_model.classes_))
cm_test_norm = pd.DataFrame(confusion_matrix(y_test, preds_test, labels = best_model.classes_, normalize = 'all'))

cm_test.to_json("assets/c_matrix.json")
cm_test_norm.to_json("assets/c_matrix_norm.json")


