In [21]:
# librería para manejar las flexiones gramaticales en el idioma inglés.
!pip install inflect
!pip install --disable-pip-version-check --progress-bar off -q https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
!pip install -U pandas-profiling



In [3]:
seed = 161

import pandas as pd
import warnings; warnings.simplefilter('ignore')
import nltk

from nltk.corpus import stopwords


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


from sklearn.pipeline import Pipeline

from joblib import dump, load


In [4]:
# Descargando las stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /Users/andres/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andres/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/andres/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
from sklearn.utils import resample



In [6]:
# Se cargan los datos. 

file_name = 'reviews_train_val.csv'
reviews_df = pd.read_csv('data/reviews_train_val.csv', sep = ',')

neg_class_resampled = resample(
    reviews_df, replace=False, n_samples=1000,
    random_state=1234,
)
reviews_df = neg_class_resampled.reset_index(drop=True)

In [62]:
import json
from pandas_profiling import ProfileReport
##profile = ProfileReport(reviews_df, title='Profiling Report', explorative=True)

profile = reviews_df.profile_report()
p = profile.to_json()
y = json.loads(p)

stars = reviews_df.describe()



my_profile = {"count": y['table']['n'], "n_var" : y['table']["n_var"],
              "n_cells_missing": y['table']["n_cells_missing"],
"n_vars_with_missing": y['table']["n_vars_with_missing"],
"n_vars_all_missing": y['table']["n_vars_all_missing"],
"p_cells_missing": y['table']["p_cells_missing"],
"n_duplicates": y['table']["n_duplicates"],
"p_duplicates": y['table']["p_duplicates"],

              "stars": json.loads(stars.to_json())["stars"]
              }


with open("assets/profile.json", "w") as write_file:
    json.dump(my_profile, write_file)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

In [63]:
# Se filtran los que no tienen etiqueta
reviews_labeled_df = reviews_df.loc[reviews_df['stars'].notnull()]
reviews_labeled_df.shape

(1000, 2)

In [8]:
reviews_labeled_df.describe()

Unnamed: 0,stars
count,1000.0
mean,3.736
std,1.48206
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [9]:


X_train, X_test, y_train, y_test = train_test_split(reviews_labeled_df['text'], reviews_labeled_df['stars'], test_size = 0.2, stratify = reviews_labeled_df['stars'], random_state = 1)

X_train.shape

(800,)

In [10]:
from Processor.TextProcessor import TextProcesser


estimators = [
    ("processer", TextProcesser()),
    ("vect", CountVectorizer(max_df = 0.5, min_df = 0.001)),
]
processing_pipeline = Pipeline(estimators)


In [11]:
X_train = processing_pipeline.fit_transform(X_train).toarray()
X_test = processing_pipeline.transform(X_test).toarray()

In [12]:
dump(processing_pipeline, "assets/pipeline.joblib")

['assets/pipeline.joblib']

In [13]:
from sklearn.linear_model import LogisticRegression

best_model = LogisticRegression(C=50,
 multi_class = 'auto',
 penalty= 'l2',
 solver= 'newton-cg',
 )


In [14]:
best_model.fit(X_train, y_train)

LogisticRegression(C=50, solver='newton-cg')

In [15]:

dump(best_model, "assets/model.joblib")

['assets/model.joblib']

In [16]:
preds_train = best_model.predict(X_train)
preds_test = best_model.predict(X_test)

In [17]:
from sklearn.metrics import precision_recall_fscore_support


def pandas_classification_report(y_true, y_pred):
    metrics_summary = precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred)

    avg = list(precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            average='weighted'))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index)

    support = class_report_df.loc['support']
    total = support.sum()
    avg[-1] = total

    class_report_df['avg'] = avg

    return class_report_df.T

df_class_report = pandas_classification_report(y_test, preds_test)

df_class_report.to_json("assets/report.json")


In [18]:
vocabulary = processing_pipeline.steps[1][1].vocabulary_

coef = pd.DataFrame(best_model.coef_)
coef.columns = vocabulary

coef.to_json("assets/coefficients.json")

In [19]:
coef.head()


Unnamed: 0,got,vitamin,b12,don,debby,gav,amaz,energy,boost,felt,...,threading,sir,zhar,thread,sarees,india,karissima,zharee,fascin,fascinate
0,-2.5e-05,-0.030247,0.075612,-0.084959,-0.000345,-0.001015,-2e-06,-2.706658e-07,-0.054922,-1.7e-05,...,2e-06,-0.003137818,-0.022785,-0.008994,0.00301669,0.00301669,0.02264988,0.02264988,8.577819e-08,-0.00173
1,-0.007309,-0.016717,0.103544,0.008295,-0.007429,-0.006864,-0.005756,-0.04190186,-0.018415,-0.000205,...,-0.016588,0.02598167,-0.053668,0.061913,-0.002599313,-0.002599313,-0.01842878,-0.01842878,0.04895428,-0.000209
2,-0.010976,0.064067,-0.258629,0.000727,-0.003673,-1.3e-05,-6e-06,0.04454464,-0.036398,-0.00067,...,-9.5e-05,-0.007213409,-0.057267,0.012053,-0.0004180863,-0.0004180863,-1.89356e-05,-1.89356e-05,-1.728936e-06,-0.000135
3,0.078862,-0.013893,0.462883,0.186397,-0.14956,0.007907,-0.000631,-1.401223e-06,-0.092517,0.043735,...,-0.007348,-0.0156303,0.55368,-0.041999,3.975529e-07,3.975529e-07,-0.004201871,-0.004201871,-0.04573214,-0.023497
4,-0.060551,-0.00321,-0.383409,-0.11046,0.161007,-1.5e-05,0.006396,-0.002641109,0.202252,-0.042842,...,0.02403,-1.438166e-07,-0.41996,-0.022973,3.121459e-07,3.121459e-07,-2.95756e-07,-2.95756e-07,-0.003220504,0.025571


In [20]:
from sklearn.metrics import confusion_matrix

cm_test = pd.DataFrame(confusion_matrix(y_test, preds_test, labels = best_model.classes_))
cm_test_norm = pd.DataFrame(confusion_matrix(y_test, preds_test, labels = best_model.classes_, normalize = 'all'))

cm_test.to_json("assets/c_matrix.json")
cm_test_norm.to_json("assets/c_matrix_norm.json")


