In [None]:
import numpy as np
import pandas as pd

import os

import matplotlib.pyplot as plt
%matplotlib inline

import plotly.graph_objects as go

from sklearn.model_selection import cross_val_score

# Load data

In [None]:
# Training split of the Kaggle "nlp-getting-started" data set; the bare
# expression on the last line displays the loaded frame as the cell output.
train_csv_path = "/kaggle/input/nlp-getting-started/train.csv"
tweets_train = pd.read_csv(train_csv_path)
tweets_train

In [None]:
# Test split of the Kaggle "nlp-getting-started" data set; the bare
# expression on the last line displays the loaded frame as the cell output.
test_csv_path = "/kaggle/input/nlp-getting-started/test.csv"
tweets_test = pd.read_csv(test_csv_path)
tweets_test

In [None]:
#tweets = pd.concat([tweets_train, tweets_test])
#tweets = tweets[['keyword', 'location', 'text', 'target']].copy()
#tweets.reset_index(inplace=True)
#tweets

In [None]:
# tweets.info()

# Normalization

In [None]:
# Encode `location` as a binary flag: 1 when the tweet has a location, 0 when
# it is missing.
# pandas loads empty CSV fields as NaN (not None), and `NaN == None` is False,
# so the previous `x == None` check never matched and every row became 1.
# NOTE: the column is overwritten in place. Re-run the loading cells before
# re-running this one, otherwise the existing 0/1 flags all count as "present".

for tweets in [tweets_train, tweets_test]:
    tweets['location'] = tweets['location'].notna().astype(int)

In [None]:
# Encode `keyword` as a binary flag: 1 when the tweet has a keyword, 0 when
# it is missing.
# pandas loads empty CSV fields as NaN (not None), and `NaN == None` is False,
# so the previous `x == None` check never matched and every row became 1.
# NOTE: the column is overwritten in place. Re-run the loading cells before
# re-running this one, otherwise the existing 0/1 flags all count as "present".

for tweets in [tweets_train, tweets_test]:
    tweets['keyword'] = tweets['keyword'].notna().astype(int)

In [None]:
# Replace every URL in the tweet text with the placeholder token 'url'.
# The pattern is a raw string so `\S` reaches the regex engine unchanged (in a
# plain string literal it is an invalid escape sequence), and the dot after
# `www` is escaped so it matches a literal '.' instead of any character.

for tweets in [tweets_train, tweets_test]:
    tweets['text'] = tweets['text'].str.replace(r'http\S+|www\.\S+', 'url', regex=True, case=False)

In [None]:
# Strip HTML tags: remove everything from a '<' up to the nearest following '>'.

for tweets in (tweets_train, tweets_test):
    text_without_tags = tweets['text'].str.replace('<.*?>', '', regex=True, case=False)
    tweets['text'] = text_without_tags

In [None]:
# Delete @mentions: an '@' followed by any run (possibly empty) of ASCII
# letters, digits or underscores is removed from the tweet text.
mention_pattern = '@[a-zA-Z0-9_]*'

for tweets in (tweets_train, tweets_test):
    text_column = tweets['text']
    tweets['text'] = text_column.str.replace(mention_pattern, '', regex=True, case=False)

In [None]:
# I am going to keep the hashtags for now

#for tweets in [tweets_train, tweets_test]:
#    tweets.loc[:, 'text'] = tweets['text'].str.replace('#[a-zA-Z0-9_]*', '', regex=True, case=False)

In [None]:
# Keep only ASCII letters and the space character; every other character
# (digits, punctuation, '#', line breaks, ...) is dropped from the text.

for tweets in (tweets_train, tweets_test):
    letters_only = tweets['text'].str.replace('[^a-zA-Z ]', '', regex=True)
    tweets['text'] = letters_only

# Word Tokenizer

In [None]:
from nltk.tokenize import word_tokenize

# Tokenize each cleaned tweet and keep the result in a new `text_words` column.
for tweets in (tweets_train, tweets_test):
    tweets['text_words'] = tweets['text'].apply(word_tokenize)

# Additional features

In [None]:
!pip install pyspellchecker

In [None]:
# Per tweet, count the entries that the spell checker reports as unknown.

from spellchecker import SpellChecker

spell = SpellChecker()


def _count_unknown_words(words):
    """Return how many items `spell.unknown` yields for the token list `words`."""
    return len(spell.unknown(words))


for tweets in (tweets_train, tweets_test):
    tweets['spelling_mistakes'] = tweets['text_words'].apply(_count_unknown_words)

In [None]:
# Character length of each (already cleaned) tweet text.

for tweets in (tweets_train, tweets_test):
    tweets['tweet_size'] = tweets['text'].map(len)

In [None]:
# Number of upper-case characters in each tweet text.


def _count_uppercase(text):
    """Return how many characters of `text` satisfy `str.isupper`."""
    return sum(map(str.isupper, text))


for tweets in (tweets_train, tweets_test):
    tweets['tweet_caps'] = tweets['text'].apply(_count_uppercase)

# Vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over lower-cased tokens made of 3 to 15 ASCII letters, with the
# built-in English stop-word list; fitted on the training tweets only.
tfidf_options = dict(
    lowercase=True,
    stop_words='english',
    token_pattern='[a-zA-Z]{3,15}',
)
vectorizer = TfidfVectorizer(**tfidf_options).fit(tweets_train['text'])

In [None]:
# Transform both splits with the vectorizer fitted on the training tweets.
bag_of_words_train, bag_of_words_test = (
    vectorizer.transform(split['text']) for split in (tweets_train, tweets_test)
)

In [None]:
# Scale the hand-crafted numeric features by their maximum in the TRAINING set.
# The same divisor is applied to the test set so that both splits live on the
# same scale; previously the test set was divided by its own maxima, so an
# identical raw value mapped to different feature values in train and test.
# Train and test are handled in one cell on purpose: the training maximum must
# be read before the training column is overwritten.
# Test values may end up slightly above 1 when they exceed the training maximum.
scaled_columns = ['spelling_mistakes', 'tweet_size', 'tweet_caps']

for column in scaled_columns:
    train_max = tweets_train[column].max()
    if train_max == 0:
        # All-zero training column: dividing would turn it into NaN (0 / 0).
        continue
    tweets_train[column] = tweets_train[column] / train_max
    tweets_test[column] = tweets_test[column] / train_max

In [None]:
from scipy import sparse

# Hand-crafted columns appended, in this order, to the right of the TF-IDF
# matrices.
extra_feature_columns = ['keyword', 'location', 'spelling_mistakes', 'tweet_size', 'tweet_caps']


def _append_feature_columns(matrix, frame):
    """Return `matrix` with one extra column per name in `extra_feature_columns`.

    Each column is read from `frame`, reshaped to a single-column 2-D array and
    horizontally stacked onto the matrix, one column at a time.
    """
    for name in extra_feature_columns:
        column = np.array(frame[name]).reshape(-1, 1)
        matrix = sparse.hstack((matrix, column))
    return matrix


bag_of_words_train = _append_feature_columns(bag_of_words_train, tweets_train)
bag_of_words_test = _append_feature_columns(bag_of_words_test, tweets_test)

# Training and Validation Datasets

In [None]:
# Densify both feature matrices. Despite its name, X_val holds the features of
# the test tweets (it is built from bag_of_words_test).
X_train, X_val = (
    matrix.toarray() for matrix in (bag_of_words_train, bag_of_words_test)
)

# Labels for the training rows.
y_train = tweets_train['target']

In [None]:
#from sklearn.preprocessing import MinMaxScaler

#scaler = MinMaxScaler()

#X_train = scaler.fit_transform(X_train)
#X_val = scaler.transform(X_val)

# WordClouds

In [None]:
from wordcloud import WordCloud

In [None]:
# Disaster WordCloud
# NOTE(review): the whole cell body is wrapped in a triple-quoted string, so
# none of the code below runs; the cell only evaluates the string itself.
# The disabled code indexes `X_train.loc[:, 'abc':]`, but X_train is created
# with `.toarray()` and is therefore a NumPy array without `.loc`. It would
# need a DataFrame with word-named columns to work — TODO confirm the intent.
# The result of `sort_values` is also not assigned, so it would have no effect.
'''
disaster_sum_words = X_train.loc[:,'abc':].sum(axis=0)
disaster_sum_words.sort_values(ascending=False)

wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", repeat=False)
wordcloud.generate_from_frequencies(disaster_sum_words)

plt.figure(figsize = (10, 10))
plt.title("Top words in disaster tweets")
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
'''

In [None]:
# Non-Disaster WordCloud
# NOTE(review): the whole cell body is wrapped in a triple-quoted string, so
# none of the code below runs; the cell only evaluates the string itself.
# The disabled code indexes `X_val.loc[:, 'abc':]`, but X_val is created with
# `.toarray()` (a NumPy array without `.loc`) from the test-set features, not
# from tweets filtered by label — the "non-disaster" title looks unsupported;
# TODO confirm the intent before re-enabling.
'''
disaster_sum_words = X_val.loc[:,'abc':].sum(axis=0)
disaster_sum_words.sort_values(ascending=False)

wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white", repeat=False)
wordcloud.generate_from_frequencies(disaster_sum_words)

plt.figure(figsize = (10, 10))
plt.title("Top words in non-disaster tweets")
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
'''

# Models

## 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(C=10, max_iter=5000)

# 5-fold cross-validated scores of the (still unfitted) estimator.
scores = cross_val_score(lr_model, X_train, y_train, cv=5)

# Fit on every training row, then score on those same rows, so `final_score`
# is a training-set score.
final_score = lr_model.fit(X_train, y_train).score(X_train, y_train)

print(f"Scores: {scores} \nMean: {scores.mean()} \nFinal Score: {final_score}")

## 2. Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

nbmodel = GaussianNB()

# 5-fold cross-validated scores of the (still unfitted) estimator.
scores = cross_val_score(nbmodel, X_train, y_train, cv=5)

# Fit on every training row, then score on those same rows, so `final_score`
# is a training-set score.
final_score = nbmodel.fit(X_train, y_train).score(X_train, y_train)

print(f"Scores: {scores} \nMean: {scores.mean()} \nFinal Score: {final_score}")

## 3. KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbourhood sizes: the odd values 1, 3, ..., 19.
k_values = list(range(1, 20, 2))
scores = []

# Score each k with 5-fold cross-validation. Scoring on the very rows the
# model was fitted on says little for KNN: with k=1 every row is its own
# nearest neighbour, so the smallest k always looks best.
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores.append(cross_val_score(knn, X_train, y_train, cv=5).mean())

fig = go.Figure()
fig.add_trace(go.Scatter(x=k_values, y=scores, mode='lines+markers'))
# k is plotted on the x axis and the score on the y axis (the two titles were
# swapped before).
fig.update_layout(xaxis_title="k", yaxis_title="knn score")
fig.show()

In [None]:
# KNN classifier with 5 neighbours.
knn_model = KNeighborsClassifier(n_neighbors=5)

# 5-fold cross-validated scores of the (still unfitted) estimator.
scores = cross_val_score(knn_model, X_train, y_train, cv=5)

# Fit on every training row, then score on those same rows, so `final_score`
# is a training-set score.
final_score = knn_model.fit(X_train, y_train).score(X_train, y_train)

print(f"Scores: {scores} \nMean: {scores.mean()} \nFinal Score: {final_score}")

## 4. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the forest, and therefore the scores printed
# below, reproducible from one run of the notebook to the next.
rfc_model = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1, random_state=42)

# 5-fold cross-validated scores; evaluated before the final fit, as in the
# other model cells.
scores = cross_val_score(rfc_model, X_train, y_train, cv=5)

# Fit on every training row, then score on those same rows, so `final_score`
# is a training-set score.
rfc_model.fit(X_train, y_train)

final_score = rfc_model.score(X_train, y_train)

print(f"Scores: {scores} \nMean: {scores.mean()} \nFinal Score: {final_score}")

## 5. Multinomial NB

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes
mnb_model = MultinomialNB()

# 5-fold cross-validated scores of the estimator.
scores = cross_val_score(mnb_model, X_train, y_train, cv=5)

# Fit on every training row, then score on those same rows, so `final_score`
# is a training-set score.
final_score = mnb_model.fit(X_train, y_train).score(X_train, y_train)

print(f"Scores: {scores} \nMean: {scores.mean()} \nFinal Score: {final_score}")

## 6. Neural Network

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Display the size of X_train's second axis, i.e. the number of feature
# columns fed to the models.
X_train.shape[1]

In [None]:
# Number of input features: one per column of X_train.
input_shape = X_train.shape[1]

# Model definition: two ReLU hidden layers followed by a single sigmoid unit.
model = keras.Sequential(
    [
        # `shape` has to be a tuple. `(input_shape)` without the trailing comma
        # is just the bare integer, which newer Keras versions reject.
        keras.Input(shape=(input_shape,)),
        layers.Dense(128, activation="relu"),
        layers.Dense(64, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ]
)

# Show the architecture. Starting the Sequential model with an Input object
# already fixes the input shape, so no explicit model.build(...) call is
# needed (the previous call passed a bare integer instead of a batch shape).
model.summary()

In [None]:
# Compile the network: binary cross-entropy loss, Adam optimizer, and
# accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Run training: 10 epochs with 40% of the training data held out for validation.
fit_options = dict(epochs=10, verbose=1, validation_split=0.4)
history = model.fit(X_train, y_train, **fit_options)

# Predictions

In [None]:
# Run the neural network on X_val, which holds the features of the test
# tweets (built from bag_of_words_test); the bare expression on the last line
# displays the raw model outputs.
predictions = model.predict(X_val)
predictions

In [None]:
# Start from the competition's sample submission file and overwrite its
# `target` column with the thresholded network outputs.
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")


def _to_label(score):
    """Return 1 when `score` is strictly above 0.8, otherwise 0."""
    return 1 if score > .8 else 0


sample_submission['target'] = predictions
sample_submission['target'] = sample_submission['target'].apply(_to_label)

# Write the submission next to the notebook, without the index column.
sample_submission.to_csv("sample_submission.csv", index=False)

In [None]:
# Quick look at the first rows of the submission frame.
sample_submission.head()

In [None]:
# Display the submission frame as the cell output.
sample_submission