<a href="https://colab.research.google.com/github/stefano2211/RepositorioDataScience/blob/main/KingLeagueModelPredictionTweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
import snscrape.modules.twitter as sntwitter
from sklearn.feature_extraction.text import CountVectorizer
import neattext as nt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from google.colab import files
import io

In [37]:
# Fetch the NLTK stop-word corpus; the stop-word filter defined later in this
# notebook reads its Spanish word list from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Análisis de los datos

In [38]:
# Load the tweet dataset from a CSV file expected in the working directory.
# NOTE(review): the 'Unnamed: 0*' columns shown by df.head() look like row
# indexes written out by earlier to_csv calls — confirm.
df = pd.read_csv('KingsLeagueData.csv')

In [39]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Tweet,Subjectivity,Polarity,Analysis
0,0,2023-02-18 20:17:36+00:00,aldamoooon KingsLeague gerardpique Spursito Ik...,0.0,0.0,Neutral
1,1,2023-02-18 20:17:02+00:00,KingsLeague PorcinosFC kfutbolclub gerardpique...,0.166667,0.0,Neutral
2,2,2023-02-18 20:15:26+00:00,adanilo KingsLeague gerardpique Spursito IkerC...,0.066667,-0.166667,Negative
3,3,2023-02-18 20:13:58+00:00,UltMostoles Juanmaglez iangonzalezzz ArcheAlva...,0.0,0.0,Neutral
4,4,2023-02-18 20:13:00+00:00,UltMostoles JijantesFC KingsLeague Primera win...,0.366667,0.525,Positive


In [40]:
# Preview the frame without the sentiment scores and the 'Unnamed: 0' column.
# `columns=` names the axis explicitly instead of relying on a boolean being
# accepted as axis 1. The result is only displayed, not assigned, so `df`
# itself keeps every column.
df.drop(columns=['Subjectivity', 'Polarity', 'Unnamed: 0'])

Unnamed: 0,Date,Tweet,Analysis
0,2023-02-18 20:17:36+00:00,aldamoooon KingsLeague gerardpique Spursito Ik...,Neutral
1,2023-02-18 20:17:02+00:00,KingsLeague PorcinosFC kfutbolclub gerardpique...,Neutral
2,2023-02-18 20:15:26+00:00,adanilo KingsLeague gerardpique Spursito IkerC...,Negative
3,2023-02-18 20:13:58+00:00,UltMostoles Juanmaglez iangonzalezzz ArcheAlva...,Neutral
4,2023-02-18 20:13:00+00:00,UltMostoles JijantesFC KingsLeague Primera win...,Positive
...,...,...,...
2995,2023-02-16 16:21:06+00:00,KingsLeague Beguer El Barrio,Neutral
2996,2023-02-16 16:20:27+00:00,KingsLeague El ms grande de la historia,Neutral
2997,2023-02-16 16:18:22+00:00,KingsLeague Solo han ganado un partido,Negative
2998,2023-02-16 16:17:22+00:00,KingsLeague Me gusta revelarme,Neutral


In [41]:
# Remove the words that carry no information (Spanish stop words).
from functools import lru_cache


@lru_cache(maxsize=None)
def _spanish_stopwords():
  """Return NLTK's Spanish stop-word list as a frozenset, loaded only once."""
  return frozenset(stopwords.words('spanish'))


def textdrop(text):
  """Lower-case `text` and drop the Spanish stop words from it.

  Args:
    text: Raw tweet text. Non-string values (for example the NaN pandas uses
      for an empty CSV field) are treated as an empty tweet.

  Returns:
    The remaining lower-cased words joined by single spaces; "" when nothing
    is left or when `text` is not a string.
  """
  if not isinstance(text, str):
    return ""

  # Cached set: the corpus is read once and membership tests are O(1).
  sw = _spanish_stopwords()
  lowered = (word.lower() for word in text.split())
  return " ".join(word for word in lowered if word not in sw)

In [42]:
# Overwrite each tweet with its lower-cased, stop-word-free version.
df['Tweet'] = df['Tweet'].apply(textdrop)

In [43]:
# Count how many tweets fall into each class we want to predict.
df['Analysis'].value_counts()

Neutral     1871
Positive     792
Negative     337
Name: Analysis, dtype: int64

# Preprocesado de los datos

In [44]:
# Vectorize the tweets into a bag-of-words count matrix.
# fit_transform learns the vocabulary and builds the matrix in a single pass
# over the corpus instead of tokenizing it once to fit and again to transform.
# NOTE(review): the vocabulary is learned from every tweet before any
# train/test or cross-validation split, so held-out rows influence the feature
# space — consider fitting the vectorizer inside a Pipeline.
vector = CountVectorizer()
x = vector.fit_transform(df.Tweet)

In [45]:
# Convert the string class labels to integer codes.
from sklearn import preprocessing

# Learn the label -> integer mapping first, then apply it to the same column.
encoder = preprocessing.LabelEncoder().fit(df['Analysis'])
df['Analysis'] = encoder.transform(df['Analysis'])

In [46]:
# Target vector: the encoded class of every tweet.
y = df['Analysis']

# Modelado

In [47]:
# Class balancing: randomly oversample the feature matrix and labels.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)  # fixed seed so the resampling is repeatable
X_resampled, y_resampled = ros.fit_resample(x, y)
# NOTE(review): the model cells visible in this notebook are fitted and scored
# on `x`/`y`, not on `X_resampled`/`y_resampled`, so this balancing currently
# has no effect on the reported results — confirm whether that is intended.
# NOTE(review): if the resampled data is used, oversampling before splitting
# can put copies of the same tweet in both train and test — resample inside
# each training fold instead.

In [50]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split




# Candidate classifiers to compare, as (short label, estimator) pairs.
models = [
    ('LoR', LogisticRegression(solver="lbfgs", max_iter=1000)),
    ('KNN', KNeighborsClassifier(n_neighbors=8)),
    ('CAR', DecisionTreeClassifier(random_state=5)),
]

results = []  # per-fold score arrays, one entry per model
names = []    # model labels, in the same order as `results`
scoring = 'accuracy'

# The splitter does not depend on the model, so build it once. Folds are
# contiguous blocks of rows (no shuffling, no stratification).
# NOTE(review): given the class imbalance shown by value_counts(), consider
# StratifiedKFold so every fold keeps the class proportions.
kfold = KFold(n_splits=20)

for name, model in models:
    # Pass `scoring` explicitly so the metric named above is the one reported.
    result = cross_val_score(model, x, y, cv=kfold, scoring=scoring)
    results.append(result)
    names.append(name)
    print(f"{name}: {result.mean()*100.0:,.2f} ({result.std()*100.0:,.2f})")

LoR: 75.90 (4.60)
KNN: 65.93 (6.47)
CAR: 72.97 (5.57)


In [106]:
# Hold out 30% of the tweets for evaluation. The fixed seed makes the split
# (and therefore the score) repeatable across runs; stratifying keeps the class
# proportions of `y` in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=0, stratify=y
)
# NOTE: the variable is called `svc` but holds a logistic regression; the name
# is kept in case other cells refer to it. The saga solver shuffles the data,
# so it is seeded as well.
svc = LogisticRegression(solver="saga", max_iter=2000, random_state=0)
svc.fit(X_train, y_train)
predicted = svc.predict(X_test)
# Cohen's kappa: agreement between predictions and labels, corrected for chance.
cohen_score = cohen_kappa_score(y_test, predicted)
print(f"Cohens Score: {cohen_score*100.0:,.2f}")

Cohens Score: 52.13


In [114]:
# Hold out 30% of the tweets for evaluation. The fixed seed makes the split
# (and therefore the score) repeatable across runs; stratifying keeps the class
# proportions of `y` in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=0, stratify=y
)
# NOTE: the variable is called `svc` but holds a k-nearest-neighbours
# classifier; the name is kept in case other cells refer to it.
svc = KNeighborsClassifier(n_neighbors=10, algorithm='brute')
svc.fit(X_train, y_train)
predicted = svc.predict(X_test)
# Cohen's kappa: agreement between predictions and labels, corrected for chance.
cohen_score = cohen_kappa_score(y_test, predicted)
print(f"Cohens Score: {cohen_score*100.0:,.2f}")

Cohens Score: 12.40


In [144]:
# Hold out 30% of the tweets for evaluation. The fixed seed makes the split
# (and therefore the score) repeatable across runs; stratifying keeps the class
# proportions of `y` in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=0, stratify=y
)
# NOTE: the variable is called `svc` but holds a decision tree; the name is
# kept in case other cells refer to it.
svc = DecisionTreeClassifier(random_state=10, criterion='entropy', splitter='random')
svc.fit(X_train, y_train)
predicted = svc.predict(X_test)
# Cohen's kappa: agreement between predictions and labels, corrected for chance.
cohen_score = cohen_kappa_score(y_test, predicted)
print(f"Cohens Score: {cohen_score*100.0:,.2f}")

Cohens Score: 53.52
