In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
import os

from wordcloud import WordCloud


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from bs4 import BeautifulSoup
!pip install contractions
import contractions
from textblob import TextBlob
import spacy

# Let's load the train and test sets

In [None]:
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [None]:
train.head()

In [None]:
test.head()

In [None]:
test.info()

In [None]:
train.info()

In [None]:
train.isna().sum()

In [None]:
test.isna().sum()

In [None]:
plt.style.use('ggplot')

plt.rcParams['figure.figsize'] = [8,4]
plt.rcParams['figure.dpi'] = 120

In [None]:
# Class balance: number of tweets per value of `target`.
# The column is passed by keyword; recent seaborn releases no longer accept it positionally.
sns.countplot(x='target', data=train)
plt.title('Real or not Disaster tweets')

In [None]:
# Helper functions that extract simple count features from a tweet

from spacy.lang.en.stop_words import STOP_WORDS as stopwords
global stopwords

def get_words_counts(tweets):
    """Return the number of whitespace-separated tokens in ``tweets``.

    The value is converted with ``str()`` first, so non-string input is
    counted through its string representation.
    """
    tokens = str(tweets).split()
    return len(tokens)
    
def get_chars_counts(tweets):
    """Return the number of non-whitespace characters in the string ``tweets``."""
    # Splitting on whitespace and summing token lengths ignores every whitespace character.
    return sum(len(token) for token in tweets.split())
    
def get_average_word_length(tweets):
    """Return the mean token length (non-whitespace characters per token).

    Parameters
    ----------
    tweets : str
        Text to measure; it is converted with ``str()`` before splitting.

    Returns
    -------
    float
        Characters divided by tokens, or ``0.0`` when the text contains no
        tokens (empty or whitespace-only input) instead of raising
        ``ZeroDivisionError``.
    """
    tokens = str(tweets).split()
    if not tokens:
        # No words: avoid dividing by zero.
        return 0.0
    return sum(len(token) for token in tokens) / len(tokens)
    
def get_stopwords_count(tweets):
    """Return how many whitespace-separated tokens of ``tweets`` are stop words.

    Membership is tested against the module-level ``stopwords`` collection.
    The global is only read here; it is never rebound, so the function can be
    called repeatedly (e.g. through ``Series.apply``).
    """
    return sum(1 for token in tweets.split() if token in stopwords)
    
def get_hashtags_tags(tweets):
    """Return the number of whitespace-separated tokens that start with ``#``."""
    count = 0
    for token in tweets.split():
        if token.startswith('#'):
            count += 1
    return count
        
def get_email_tags(tweets):
    """Return the number of whitespace-separated tokens that start with ``@``.

    Uses ``str.startswith``; the previous spelling ``startwith`` does not
    exist on ``str`` and raised ``AttributeError`` on every non-empty input.
    """
    return len([token for token in tweets.split() if token.startswith('@')])
    
def get_digit_counts(tweets):
    """Return the number of numeric tokens found in ``tweets``.

    A numeric token is a run of digits that may contain ``.`` or ``,`` between
    digits (``1,200`` or ``3.5`` count once). Punctuation on its own is not
    counted.

    Returns
    -------
    int
        Count of numeric tokens, consistent with the other ``*_counts`` helpers
        (the previous version returned the raw list of regex matches, including
        bare ``.`` / ``,`` matches).
    """
    return len(re.findall(r'\d+(?:[.,]\d+)*', tweets))
    
def get_uppercase_units(tweets):
    """Return the number of whitespace-separated tokens for which ``str.isupper()`` is true."""
    upper_tokens = 0
    for token in tweets.split():
        if token.isupper():
            upper_tokens += 1
    return upper_tokens

In [None]:
# This function derives a few simple text features that we use for a bit of EDA

def get_features(df):
    """Add simple text-derived feature columns to ``df`` and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column. It is modified in place.

    Returns
    -------
    The same object that was passed in. When ``df`` is not a DataFrame,
    ``'ERROR'`` is printed and the argument is returned unchanged.
    """
    # isinstance also accepts DataFrame subclasses, unlike an exact type comparison.
    if isinstance(df, pd.DataFrame):
        text = df['text']
        df['words_counts'] = text.apply(get_words_counts)
        df['char_counts'] = text.apply(get_chars_counts)
        df['average_word_length'] = text.apply(get_average_word_length)
#        df['stopwords_counts'] = df['text'].apply(lambda x: get_stopwords_count(x))
        df['hashtags_counts'] = text.apply(get_hashtags_tags)
#        df['email_counts'] = df['text'].apply(lambda x: get_email_tags(x))
        df['digits_counts'] = text.apply(get_digit_counts)
        df['uppercase_counts'] = text.apply(get_uppercase_units)
    else:
        print('ERROR')

    return df

In [None]:
train = pd.DataFrame(train)

In [None]:
train = get_features(train)

In [None]:
train

# EDA

In [None]:
# Distribution of the per-tweet character count.
# NOTE(review): `sns.distplot` is deprecated in newer seaborn releases (replaced by
# `histplot` / `displot`) — confirm against the installed seaborn version.
sns.distplot(train['char_counts'])

In [None]:
# Overlay the character-count density for target == 1 (red) and target == 0 (blue).
# NOTE(review): `shade=` is deprecated in newer seaborn releases in favour of `fill=` —
# confirm against the installed seaborn version.
sns.kdeplot(train[train['target'] == 1]['char_counts'], shade=True, color='red')
sns.kdeplot(train[train['target'] == 0]['char_counts'], shade=True, color= 'blue')

In [None]:
sns.catplot(y='char_counts', data=train, kind='violin', col='target')

# Data Cleaning

In [None]:
def remove_tweet_username(df):
    """Remove every ``@`` followed by one or more non-whitespace characters from the string ``df``.

    The pattern is a raw string so that ``\\s`` reaches the regex engine
    unchanged instead of being an invalid Python string escape.
    """
    return re.sub(r'@[^\s]+', '', df)

def make_lower(df):
    """Return the string ``df`` converted to lower case."""
    lowered = df.lower()
    return lowered

def cont_exp(df):
    """Return ``df`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(df)
    return expanded

def make_string(df):
    """Return ``str(df)``."""
    as_text = str(df)
    return as_text

def remove_url(df):
    """Strip http/https/ftp/ssh URLs from the string ``df``."""
    url_pattern = r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    without_urls = re.sub(url_pattern, '', df)
    return without_urls

def remove_email(df):
    """Remove e-mail-like substrings (``local@domain.tld``) from the string ``df``.

    Matching is case-insensitive, so addresses containing upper-case letters
    are removed too; a lower-case-only pattern would leave them in place when
    the text has not been lower-cased yet.
    """
    return re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)', "", df, flags=re.IGNORECASE)

#Retweets
def remove_rt(df):
    """Delete stand-alone lower-case ``rt`` tokens and strip surrounding whitespace.

    The input is converted with ``str()`` first.
    """
    text = str(df)
    without_rt = re.sub(r'\brt\b', "", text)
    return without_rt.strip()

def remove_html(df):
    """Parse ``df`` with BeautifulSoup (``lxml`` parser) and return its stripped text content."""
    soup = BeautifulSoup(df, 'lxml')
    plain_text = soup.get_text()
    return plain_text.strip()


def remove_dots(df):
    """Replace every run of one or more ``.`` characters in ``df`` with a single space."""
    return re.sub(r'\.{1,}', ' ', df)

def remove_special_chars(df):
    """Replace runs of non-word characters with a space and normalise whitespace.

    The result has single spaces between tokens and no leading or trailing
    whitespace.
    """
    spaced = re.sub(r'[^\w]+', " ", df)
    tokens = spaced.split()
    return ' '.join(tokens)

def make_base(df):
    """Return ``df`` with each token replaced by its lemma, joined by single spaces.

    The text is processed with the module-level ``nlp`` callable. Tokens whose
    lemma is ``'-PRON-'`` or ``'be'`` keep their original surface text.
    """
    doc = nlp(str(df))
    lemmas = [
        token.text if token.lemma_ in ('-PRON-', 'be') else token.lemma_
        for token in doc
    ]
    return ' '.join(lemmas)

def spelling_correction(df):
    """Return the result of ``TextBlob(df).correct()``."""
    blob = TextBlob(df)
    return blob.correct()

def resub(df):
    """Collapse any character repeated three or more times in a row to a single occurrence."""
    collapsed = re.sub("(.)\\1{2,}", "\\1", df)
    return collapsed



def get_clean_data(df):
    """Run the tweet-cleaning pipeline on one text value and return the cleaned string.

    Steps, in order: string conversion, URL removal, e-mail removal, HTML
    stripping, lower-casing, contraction expansion, ``rt`` removal, collapsing
    of characters repeated 3+ times, and ``@username`` removal.

    The string conversion runs first so that every later (string-only) step
    receives a ``str``; for inputs that are already strings the result is
    unchanged.
    """
    df = make_string(df)
    df = remove_url(df)
    df = remove_email(df)
#    df = remove_special_chars(df)
    df = remove_html(df)
#    df = remove_dots(df)
#    df = make_base(df)
#    df = spelling_correction(df).raw_sentences[0]
    df = make_lower(df)
    df = cont_exp(df)
    df = remove_rt(df)
    df = resub(df)
    # Usernames are removed after e-mails so that addresses are not cut in half.
    df = remove_tweet_username(df)

    return df

In [None]:
train['text'] = train['text'].apply(get_clean_data)

In [None]:
test['text'] = test['text'].apply(get_clean_data)

In [None]:
train['text'] = train['text'].apply(remove_special_chars)
test['text'] = test['text'].apply(remove_special_chars)

In [None]:
train['text'].head(20)

In [None]:
from gensim.parsing.preprocessing import STOPWORDS

In [None]:
# Build a stop-word set from gensim's STOPWORDS that does not contain "not".
sw_list = {"not"}
all_stopwords_gensim = STOPWORDS.difference(sw_list)

In [None]:
# Use Gensim

from gensim.parsing.preprocessing import remove_stopwords


def _remove_custom_stopwords(text):
    """Drop every whitespace-separated token of ``text`` found in ``all_stopwords_gensim``."""
    return ' '.join(token for token in text.split() if token not in all_stopwords_gensim)


# Filter with the customised set (which excludes "not") rather than calling
# `remove_stopwords` with its default list.
train['text'] = train['text'].apply(_remove_custom_stopwords)
test['text'] = test['text'].apply(_remove_custom_stopwords)


In [None]:
train.text

In [None]:
# Remove every run of digits from the text column.

# regex=True is passed explicitly: without it, recent pandas versions treat the
# pattern as a literal string and nothing would be removed.
train['text'] = train['text'].str.replace(r'\d+', '', regex=True)
test['text'] = test['text'].str.replace(r'\d+', '', regex=True)

In [None]:
train['text'].head(20)

# WordClouds

In [None]:
def get_word_freqs(df, col):
    """Return a Series of token counts over all strings in ``df[col]``.

    All values of the column are joined with spaces, split on whitespace, and
    counted with ``value_counts`` (most frequent first).
    """
    tokens = ' '.join(df[col]).split()
    return pd.Series(tokens).value_counts()

In [None]:
real_data = get_word_freqs(train[train['target']==1], 'text')
real_data = ' '.join(real_data.index)
real_data

In [None]:
# Word cloud for tweets with target == 1, sized by how often each word occurs.
# The counts are recomputed here because `real_data` only holds each unique word once.
real_freqs = get_word_freqs(train[train['target'] == 1], 'text')
wordcloud = WordCloud().generate_from_frequencies(real_freqs.to_dict())
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
not_real_data = get_word_freqs(train[train['target']==0], 'text')
not_real_data = ' '.join(not_real_data.index)
not_real_data

In [None]:
# Word cloud for tweets with target == 0, sized by how often each word occurs.
# The counts are recomputed here because `not_real_data` only holds each unique word once.
not_real_freqs = get_word_freqs(train[train['target'] == 0], 'text')
wordcloud = WordCloud().generate_from_frequencies(not_real_freqs.to_dict())
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

# TF_IDF

In [None]:
text = train['text']
y = train['target']

In [None]:
# Turn the cleaned tweets into a TF-IDF matrix.
# NOTE(review): the vectorizer is fitted on all of `text`, i.e. before the
# train/validation split performed in a later cell, so the held-out rows
# contribute to the vocabulary and IDF weights.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(text)

In [None]:
X.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 7, stratify=y)

## Support Vector Machines

In [None]:
from sklearn.svm import LinearSVC

In [None]:
clf_svm = LinearSVC()

In [None]:
def run_SVM(clf_svm, X_train, X_test, y_train, y_test):
    """Fit ``clf_svm`` on the training split and print a classification report for the test split."""
    clf_svm.fit(X_train, y_train)
    predicted = clf_svm.predict(X_test)

    print()
    print('Classification Report:')
    report = classification_report(y_test, predicted)
    print(report)

In [None]:
run_SVM(clf_svm, X_train, X_test, y_train, y_test)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

clf_lr = LogisticRegression()

In [None]:
def run_LR(clf_lr, X_train, X_test, y_train, y_test):
    """Fit ``clf_lr`` on the training split and print a classification report for the test split."""
    clf_lr.fit(X_train, y_train)
    predicted = clf_lr.predict(X_test)

    print()
    print('Classification Report:')
    report = classification_report(y_test, predicted)
    print(report)

In [None]:
run_LR(clf_lr, X_train, X_test, y_train, y_test)

## K_Nearest_Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

clf_knn = KNeighborsClassifier()

In [None]:
def run_knn(clf_knn, X_train, X_test, y_train, y_test):
    """Fit ``clf_knn`` on the training split and print a classification report for the test split."""
    clf_knn.fit(X_train, y_train)
    predicted = clf_knn.predict(X_test)

    print()
    print('Classification Report:')
    report = classification_report(y_test, predicted)
    print(report)

In [None]:
run_knn(clf_knn, X_train, X_test, y_train, y_test)

## Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

clf_mnb = MultinomialNB()

In [None]:
def run_mnb(clf_mnb, X_train, X_test, y_train, y_test):
    """Fit ``clf_mnb`` on the training split and print a classification report for the test split."""
    clf_mnb.fit(X_train, y_train)
    predicted = clf_mnb.predict(X_test)

    print()
    print('Classification Report:')
    report = classification_report(y_test, predicted)
    print(report)

In [None]:
run_mnb(clf_mnb, X_train, X_test, y_train, y_test)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

clf_tree = DecisionTreeClassifier()

In [None]:
def run_tree(clf_tree, X_train, X_test, y_train, y_test):
    """Fit ``clf_tree`` on the training split and print a classification report for the test split.

    The printed header reads 'Classification Report:', matching the other
    ``run_*`` helpers (it was previously misspelled).
    """
    clf_tree.fit(X_train, y_train)
    y_pred = clf_tree.predict(X_test)

    print()
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

In [None]:
run_tree(clf_tree, X_train, X_test, y_train, y_test)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf_forest = RandomForestClassifier()

In [None]:
def run_forest(clf_forest, X_train, X_test, y_train, y_test):
    """Fit ``clf_forest`` on the training split and print a classification report for the test split.

    The printed header reads 'Classification Report:', matching the other
    ``run_*`` helpers (it was previously misspelled).
    """
    clf_forest.fit(X_train, y_train)
    y_pred = clf_forest.predict(X_test)

    print()
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

In [None]:
run_forest(clf_forest, X_train, X_test, y_train, y_test)

In [None]:
random_forest_score = clf_forest.score(X_test, y_test)

In [None]:
random_forest_score

In [None]:
models_default_tfidf = {'Support Vector Machines': clf_svm.score(X_test, y_test),
          'Logistic Regression': clf_lr.score(X_test, y_test),
          'KNearest Neightbors': clf_knn.score(X_test, y_test),
          'Multinominal Naive Bayes': clf_mnb.score(X_test, y_test),
          'Decision Tree': clf_tree.score(X_test, y_test),
          'Random Forest Classifier': clf_forest.score(X_test, y_test)}

In [None]:
models_default_tfidf

In [None]:
default_models_compare = pd.DataFrame(models_default_tfidf, index=['accuracy'])
default_models_compare.T.plot.bar()

## Word2Vec

In [None]:
import en_core_web_lg

In [None]:
nlp = en_core_web_lg.load()

In [None]:
def get_vec(x):
    """Return the ``vector`` attribute of ``nlp(x)`` for the text ``x``."""
    return nlp(x).vector

In [None]:
train['vectors'] = train['text'].apply(lambda x: get_vec(x))

In [None]:
train.head()

In [None]:
X = train['vectors'].to_numpy()
X = X.reshape(-1, 1)

In [None]:
X.shape

In [None]:
# Build the (n_samples, 300) feature matrix straight from the per-row vectors.
# Reading from `train['vectors']` instead of the current value of `X` makes
# this cell safe to re-run.
X = np.concatenate(train['vectors'].tolist(), axis=0).reshape(-1, 300)

In [None]:
X.shape

In [None]:
X

In [None]:
### Normalization

from sklearn.preprocessing import normalize

X = normalize(X)
X

In [None]:
y = train['target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 7, stratify=y)

In [None]:
def word2vec_models(clf, X_train, X_test, y_train, y_test):
    """Fit and report the module-level classifiers on the given split.

    Runs, in order, ``run_SVM``, ``run_LR``, ``run_knn``, ``run_tree`` and
    ``run_forest`` with the global estimators ``clf_svm``, ``clf_lr``,
    ``clf_knn``, ``clf_tree`` and ``clf_forest``. Each estimator is refitted,
    so any previous fit of these objects is replaced.

    Parameters
    ----------
    clf :
        Not used; kept so that existing calls keep working.
    X_train, X_test, y_train, y_test :
        Split passed through unchanged to each ``run_*`` helper.

    Returns
    -------
    None
        Results are printed only (previously the function returned itself).
    """
    print('SVM MODEL:')
    run_SVM(clf_svm, X_train, X_test, y_train, y_test)
    print('LOGISTIC REGRESSION MODEL:')
    run_LR(clf_lr, X_train, X_test, y_train, y_test)
    print('KNEAREST NEIGHBORS MODEL:')
    run_knn(clf_knn, X_train, X_test, y_train, y_test)
    # NOTE(review): Naive Bayes is left disabled here — presumably because
    # MultinomialNB rejects negative feature values; confirm.
    #run_mnb(clf_mnb, X_train, X_test, y_train, y_test)
    print('DECISION TREE MODEL:')
    run_tree(clf_tree, X_train, X_test, y_train, y_test)
    print('RANDOM FOREST MODEL:')
    run_forest(clf_forest, X_train, X_test, y_train, y_test)
    

In [None]:
clf_list = [run_SVM, run_LR, run_knn, run_mnb, run_tree, run_forest]

In [None]:
word2vec_models(clf_list, X_train, X_test, y_train, y_test)

In [None]:
models_default_word2vec = {'Support Vector Machines': clf_svm.score(X_test, y_test),
          'Logistic Regression': clf_lr.score(X_test, y_test),
          'KNearest Neightbors': clf_knn.score(X_test, y_test),
          'Decision Tree': clf_tree.score(X_test, y_test),
          'Random Forest Classifier': clf_forest.score(X_test, y_test)}

In [None]:
models_default_word2vec

In [None]:
models_default_tfidf

# Hyperparameter tuning

In [None]:
# Logistic Regression?

In [None]:
print(clf_lr.get_params().keys())

In [None]:
%%time

from sklearn.model_selection import GridSearchCV

param_grid_lr = {'C': np.logspace(-2,2,8),
                 'random_state': [7],
                 'penalty': ['l2', 'l1'],
                 'solver': ['liblinear','sag', 'saga']}

grid_lr = GridSearchCV(clf_lr, param_grid_lr, cv=10, n_jobs = -1)

grid_lr.fit(X_train, y_train)


In [None]:
grid_lr.best_params_

In [None]:
grid_lr.best_score_

In [None]:
#####################################################################

In [None]:
#####################################################################

In [None]:
test['vectors'] = test['text'].apply(lambda x: get_vec(x))

In [None]:
X_test

In [None]:
testX = test['vectors'].to_numpy()

In [None]:
testX = testX.reshape(-1, 1)

In [None]:
testX.shape

In [None]:
# Build the (n_samples, 300) matrix for the test set straight from the per-row vectors.
# Reading from `test['vectors']` instead of the current value of `testX` makes
# this cell safe to re-run.
testX = np.concatenate(test['vectors'].tolist(), axis=0).reshape(-1, 300)

In [None]:
# from sklearn.preprocessing import normalize

X_norm = normalize(testX)


In [None]:
X_norm

In [None]:
X_norm.shape

In [None]:
df = pd.DataFrame(test['id'])

In [None]:
df

In [None]:
final = grid_lr.predict(X_norm)

In [None]:
df['target'] = final

In [None]:
df

In [None]:
df.to_csv('Disaster_tweet.csv', index=False)

In [None]:
check = pd.read_csv('Disaster_tweet.csv')

In [None]:
check.head()

In [None]:
check.target.value_counts()