In [14]:
import io
import sys
import os
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from utils.text_analysis import TextAnalysis
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix



In [15]:
ta = TextAnalysis('es')
data_raw = pd.read_csv('data.csv')
data_raw.iloc[500:1000]

Language: es
Text Analysis: ['emoji', 'tagger', 'parser', 'stemmer', 'ner']


Unnamed: 0,user_id,tweets,is_human
500,b8d7ac3042d45731e05ad00d56e0f8fe,['RT @PoeticaAcciones: Hay que dedicarle más t...,1
501,9494c5001fbfb868f34a737f69e5ac62,['@duxativa .Esta es la manera mas eficaz de e...,1
502,b8528a4f489fded49709a7bb94d8e25f,['En Adventistas.cl: Amigos de Esperanza en Nu...,0
503,de5c5734d56acc5da30f086f95646805,['El PRD presentará mañana ante el Instituto F...,0
504,7960cc1f636243b5d8f22efeb0dec6e9,['El Gobierno deniega el indulto a ‘los Albert...,0
...,...,...,...
995,3531c7bb89c410da8592642c32a5e1f8,['Como cuando tu ídolo no te deja en visto htt...,1
996,1b8ebbab6ebe891a66c18e3965bf3fb9,['#ad Anbang retira de la puja por Starwood y ...,0
997,5631ded2b52f1692e77de2eb8d89c3d3,['RT @accionlibertad: #RostrosDeLaInjusticia |...,1
998,a5c1637f723544a64c4da1b8ddfb3dd8,['RT @hurgamemoriaPE: Si tiene tan buena recom...,1


In [16]:
setting = {'url': True, 'mention': True, 'emoji': True, 'hashtag': True, 'stopwords': False, 'relabel': True} 
list_sentences = []
for row in tqdm(data_raw['tweets'].to_list()):
    text = ta.clean_text(row, **setting)
    #print('Text org: {0} \nTex clean: {1}'.format(row, text))
    list_sentences.append(text)

100%|██████████| 3000/3000 [00:21<00:00, 138.51it/s]


In [18]:
 x = list_sentences
 y = data_raw['is_human'].to_list()

In [21]:
 print('**Replica y_train:', sorted(Counter(y).items()))

**Replica y_train: [(0, 1500), (1, 1500)]


In [19]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=8675309)
print('**Replica train: {0}, size {1}'.format(sorted(Counter(y_train).items()), len(y_train)))
print('**Replica test: {0}, size {1}'.format(sorted(Counter(y_test).items()), len(y_test)))

**Replica train: [(0, 1050), (1, 1050)], size 2100
**Replica test: [(0, 450), (1, 450)], size 900


# Feature in Bag of words

In [25]:
vec = CountVectorizer(min_df=5, ngram_range=(1,3), max_features=5000, strip_accents='unicode', lowercase =True, analyzer='word')
vec.fit(x_train)
x_train = vec.transform(x_train)
x_test = vec.transform(x_test)


In [26]:
print(vec.get_feature_names())

on', 'open', 'operacion', 'opinion', 'oportunidad', 'oportunidades', 'oposicion', 'or', 'orden', 'organizacion', 'orgullo', 'origen', 'original', 'orlando', 'oro', 'ortega', 'os', 'oscar', 'otra', 'otra vez', 'otras', 'otro', 'otros', 'our', 'out', 'oye', 'pa', 'pablo', 'paciencia', 'pacto', 'padre', 'padres', 'paga', 'pagar', 'pagina', 'pago', 'pais', 'pais url', 'paises', 'palabra', 'palabras', 'pan', 'pantalla', 'papa', 'papas', 'papel', 'par', 'par de', 'para', 'para el', 'para el de', 'para este', 'para evitar', 'para hacer', 'para hashtag', 'para hashtag de', 'para hoy', 'para hoy en', 'para ir', 'para la', 'para laborar', 'para laborar en', 'para las', 'para los', 'para mention', 'para mi', 'para no', 'para poder', 'para que', 'para que no', 'para ser', 'para su', 'para ti', 'para todos', 'para un', 'para una', 'para ver', 'paraguay', 'parcialmente', 'parcialmente nublado', 'parcialmente nublado hashtag', 'parcialmente nublado mas', 'parece', 'parece que', 'parecen', 'parecer', 

In [27]:
print(x_train.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# Random Over Sampler

In [29]:
ros = RandomOverSampler(random_state=1000)
x_train, y_train = ros.fit_resample(x_train, y_train)
x_test, y_test = ros.fit_resample(x_test, y_test)
print('**RandomOverSampler train:', sorted(Counter(y_train).items()))
print('**RandomOverSampler test:', sorted(Counter(y_test).items()))

**RandomOverSampler train: [(0, 1064), (1, 1064)]
**RandomOverSampler test: [(0, 464), (1, 464)]


In [30]:
classifier = LogisticRegression(C=10, solver='lbfgs', multi_class='multinomial',max_iter=1000) 
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# Metrics

In [32]:
print('\nConfusion Matrix')
print(confusion_matrix(y_test, y_pred))





Confusion Matrix
[[446  18]
 [ 16 448]]


In [36]:
print('\nClasification Report')
print(classification_report(y_test, y_pred))
cv_score = np.mean(cross_val_score(classifier, x_train,y_train, cv=5, scoring='accuracy'))
print('fin')


Clasification Report
              precision    recall  f1-score   support

           0       0.97      0.96      0.96       464
           1       0.96      0.97      0.96       464

    accuracy                           0.96       928
   macro avg       0.96      0.96      0.96       928
weighted avg       0.96      0.96      0.96       928

fin


In [37]:
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='macro')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print('Accuracy: {}%'.format(round(accuracy, 2)*100))
print('Recall: {}%'.format(round(recall, 2)*100))
print('Precision: {}%'.format(round(precision, 2)*100))
print('F1: {}%'.format(round(f1, 2)*100))

Accuracy: 96.0%
Recall: 96.0%
Precision: 96.0%
F1: 96.0%
