In [None]:
!pip install contractions

In [None]:
!pip install missingno

In [None]:
!pip install pyspellchecker

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from unicodedata import normalize
import emoji
from textblob import TextBlob
from nltk.corpus import stopwords
import wordsegment as ws
ws.load()
import contractions
from spellchecker import SpellChecker
from nltk.stem import PorterStemmer
import missingno as msno

# Reading the training and testing dataset

In [None]:
# Read the training and test CSVs from the nlp-getting-started input folder
# (paths are relative to the notebook's working directory).
train_df = pd.read_csv('../input/nlp-getting-started/train.csv')
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')

# Basic info about training data

In [None]:
# Print the columns, dtypes and non-null counts of the training frame.
train_df.info()

# Dropping the id column from both frames, as it is not needed for modelling

In [None]:
# Discard the 'id' column from both frames in place (columns= is the keyword
# spelling of axis=1).
train_df.drop(columns='id', inplace=True)
test_df.drop(columns='id', inplace=True)

# Combining the training and testing dataset

In [None]:
# Stack the training rows on top of the test rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported equivalent. Row labels are kept as-is (the test rows restart at 0), exactly
# as append did, so columns missing from one frame are filled with NaN.
df = pd.concat([train_df, test_df])
df

# Resetting the index so that it runs from 0 to len(df) - 1

In [None]:
# Replace the row labels with a fresh 0..n-1 range; the old labels are discarded.
df = df.reset_index(drop=True)

In [None]:
# Show the combined frame after the index reset.
df

In [None]:
# Print the columns, dtypes and non-null counts of the combined frame.
df.info()

In [None]:
# Summary statistics for every column, including the non-numeric ones.
df.describe(include='all')

In [None]:
# List the distinct values of the keyword column.
df['keyword'].unique()

# Most of the keywords contain %20 (the URL encoding of a space), which is noise

# Visualizing the missing values in the complete dataset

In [None]:
# missingno nullity matrix for the combined frame.
msno.matrix(df)

In [None]:
# missingno nullity heatmap for the combined frame.
# NOTE(review): 'target' is missing for every test row by construction, so its nullity
# here reflects the train/test stacking, not the data itself.
msno.heatmap(df)

# Location has the most missing values and, looking at the heatmap, the target is not affected in any way by location, so we drop the location column

In [None]:
# Remove the 'location' column from the combined frame in place.
df.drop(columns='location', inplace=True)

# Removing the %20 in the keyword feature

In [None]:
# Swap every literal '%20' for a space; regex=False states explicitly that the
# pattern is plain text (it contains no regex metacharacters either way).
df['keyword'] = df['keyword'].str.replace('%20', ' ', regex=False)

In [None]:
# Re-check dtypes and non-null counts after dropping 'location' and cleaning 'keyword'.
df.info()

# The keyword column holds the keyword found in the tweet, so its missing values are filled with the placeholder string 'zero'

In [None]:
# Missing keywords become the literal string 'zero'. This placeholder is later
# concatenated into 'text_with_keywords', so it ends up as a token in the model input.
df['keyword'] = df['keyword'].fillna('zero')

# Loading the stop words 

In [None]:
# Build a set (fast membership tests) from NLTK's English stop-word list.
stop_words = {*stopwords.words('english')}

# Preprocessing the text column having all the tweets

# Function for finding hashtags and splitting multi-word hashtags into separate words by segmenting them

In [None]:
def extract_words(tweet):
    """Replace every hashtag in ``tweet`` with its segmented words.

    Each ``#word`` run is passed to ``ws.segment`` and replaced by the returned
    words joined with single spaces.

    The substitution is done in one regex pass so that each hashtag is rewritten
    exactly where it was matched. Rewriting with ``str.replace`` per hashtag would
    also rewrite the prefix of any longer hashtag that starts with the same text
    (``#fire`` inside ``#firefighter``), leaving the longer one unsegmented.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet with hashtags replaced; text outside hashtags is untouched.
    """
    def _segment_hashtag(match):
        # match.group(0) is the full hashtag, '#' included, as the original passed it.
        return " ".join(ws.segment(match.group(0)))

    # A callable replacement is inserted literally (no backslash processing).
    return re.sub(r"#\w+", _segment_hashtag, tweet)

# Function for removing the urls from tweets

In [None]:
def remove_urls(tweet):
    """Return ``tweet`` with every run starting at 'http' (up to the next whitespace) deleted."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', tweet)

# Function for removing numbers from the tweets

In [None]:
def remove_numbers(tweet):
    """Return ``tweet`` with every digit character deleted.

    The character class excludes non-digits and the dot, so only digits match;
    dots and all other characters are left in place.
    """
    digit_pattern = re.compile(r'[^\D\.]')
    return digit_pattern.sub('', tweet)

# Function for removing user mentions from the tweets

In [None]:
def remove_usermentions(tweet):
    """Return ``tweet`` with every '@' followed by word characters deleted."""
    mention_pattern = re.compile(r'@(\w+)')
    return mention_pattern.sub('', tweet)

# Function for removing stop words from the tweets

In [None]:
def remove_stop_words(df, index, column):
    """Strip stop words from one cell of ``df`` and store the result back.

    The cell ``df.at[index, column]`` is split on single spaces, every piece
    whose lower-cased form is in the module-level ``stop_words`` set is dropped,
    and the remaining pieces are re-joined with single spaces (trailing
    whitespace removed).

    The comparison is case-insensitive because this function is called before
    the text is lower-cased; a case-sensitive test would let capitalised stop
    words such as "The" through.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; modified in place.
    index : hashable
        Row label of the cell to clean.
    column : str
        Column label of the cell to clean.

    Returns
    -------
    str
        The cleaned text that was written into the cell.
    """
    # .at reads the scalar directly instead of the chained df[column][index] lookup.
    pieces = df.at[index, column].split(' ')

    kept = [piece for piece in pieces if piece.lower() not in stop_words]

    text = ' '.join(kept).rstrip()
    df.at[index, column] = text

    return text

# Function for removing punctuation

In [None]:
def remove_punctuations(tweet):
    """Return ``tweet`` with every character that is neither a word character nor whitespace deleted.

    Underscores count as word characters and are therefore kept.
    """
    punctuation_pattern = re.compile(r'[^\w\s]')
    return punctuation_pattern.sub('', tweet)

# Function for creating stems of all the words in the tweets

In [None]:
def stem_words(tweet):
    """Return ``tweet`` with each whitespace-separated token replaced by its Porter stem.

    Tokens are re-joined with single spaces, so runs of whitespace collapse.
    """
    stemmer = PorterStemmer()
    return ' '.join(map(stemmer.stem, tweet.split()))

# All the preprocessing is applied in this loop

1- First we strip the non-ASCII characters (after NFKD normalization)
2- Then we remove the user mentions
3- Then we split the hashtags into their component words
4- Then we expand the contractions
5- Then we convert the emojis to their full text form
6- Then we remove urls
7- Then we remove numbers
8- Then we remove punctuation
9- Then we remove the stop words
10- Then we lower case the words
11- Then we collapse unwanted whitespace
12- Then we create stems for all the words in tweets

In [None]:
def _clean_text(raw):
    """Clean one tweet or keyword string and return its lower-cased, stemmed form.

    The steps follow the numbered list above with one deliberate change: emoji
    conversion runs first instead of fifth (see the comment on that step).
    """
    # convert emojis into text -- this has to happen before the ASCII folding below,
    # which drops every non-ASCII character (emojis included) and would otherwise
    # leave demojize with nothing to convert. Space delimiters keep the emoji name
    # from fusing with neighbouring words; the extra spaces are collapsed later.
    cleaned = emoji.demojize(raw, delimiters=(" ", " "))

    # remove unicode characters (decompose accents, then drop whatever is not ASCII)
    cleaned = normalize('NFKD', cleaned).encode('ascii', 'ignore').decode('utf-8')

    # remove user mentions
    cleaned = remove_usermentions(cleaned)

    # split hashtags into their words
    cleaned = extract_words(cleaned)

    # expand contractions
    cleaned = contractions.fix(cleaned)

    # remove urls
    cleaned = remove_urls(cleaned)

    # remove numbers
    cleaned = remove_numbers(cleaned)

    # remove punctuations
    cleaned = remove_punctuations(cleaned)

    # removing stop words -- compared in lower case because the text has not been
    # lower-cased yet and NLTK's list is lower case ("The" must match "the").
    # split() with no argument also handles tabs/newlines and drops empty pieces.
    kept = [token for token in cleaned.split() if token.lower() not in stop_words]

    # lower casing the words; joining on one space also removes unwanted white space
    cleaned = ' '.join(kept).lower()

    # create stems
    return stem_words(cleaned)


# Apply the pipeline once per column instead of once per row: a single pass over each
# Series replaces the chained df[col][index] reads and df.at writes done for every row.
for column in ('text', 'keyword'):
    df[column] = df[column].map(_clean_text)
    

In [None]:
# Show the frame after the text and keyword columns were preprocessed.
df

In [None]:
# Re-check dtypes and non-null counts after preprocessing.
df.info()

# We are combining the keywords with tweets and creating a new column

In [None]:
# New column: cleaned tweet text, one space, then the cleaned keyword.
df['text_with_keywords'] = df['text'].str.cat(df['keyword'], sep=' ')

# Dropping the text column as it is no longer needed

In [None]:
# 'text' now lives inside 'text_with_keywords'; remove the original column in place.
df.drop(columns='text', inplace=True)

# Dropping the keyword column as it is no longer needed

In [None]:
# 'keyword' now lives inside 'text_with_keywords'; remove the original column in place.
df.drop(columns='keyword', inplace=True)

In [None]:
# Show the frame with only 'target' and 'text_with_keywords' left.
df

# Separating the dataset into training and testing dataframes

In [None]:
# The combined frame was built train-first, so the first len(train_df) rows are the
# training rows and everything after them is the test set.
# Deriving the boundary from train_df avoids the hard-coded 0:7612 / 7613:10876
# slices, which left the row at position 7612 in neither set.
n_train = len(train_df)

train = df.iloc[:n_train]
# The test rows never had a label (NaN after stacking), so the column is dropped.
test = df.iloc[n_train:].drop('target', axis=1)

In [None]:
# Show the training slice of the combined frame.
train

In [None]:
# Show the test slice (no 'target' column).
test

# Model Training

# Splitting the training data into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split

# The label is kept as its own Series; the features are every other column.
y = train['target']
X = train.drop(columns='target')

# Hold out 30% of the rows for validation; random_state pins the shuffle so that
# reruns produce the same partition.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Show the validation labels.
y_validation

# Creating word vectors

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Default CountVectorizer: single-word token counts.
cv = CountVectorizer()

# The vocabulary is learned from the training split only; the validation split is
# encoded with that same vocabulary. .toarray() turns the sparse count matrices into
# dense NumPy arrays.
X_train_bow = cv.fit_transform(X_train['text_with_keywords']).toarray()
X_validation_bow = cv.transform(X_validation['text_with_keywords']).toarray()

In [None]:
# (rows, vocabulary size) of the train and validation count matrices.
X_train_bow.shape, X_validation_bow.shape

In [None]:
# Shape of the training count matrix (already shown in the previous cell).
X_train_bow.shape

In [None]:
# Shape of the validation count matrix (already shown two cells above).
X_validation_bow.shape

# Testing the data on Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

# fit() trains the estimator in place and returns that same object, so gnb_model
# is simply a second name for the fitted gnb.
gnb.fit(X_train_bow, y_train)
gnb_model = gnb

In [None]:
# Predict labels for the validation count matrix with the fitted Gaussian NB model.
y_pred_validation = gnb_model.predict(X_validation_bow)

# Checking the accuracy score, confusion matrix and F1 score of the Gaussian NB results

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

# Fraction of validation rows whose Gaussian NB prediction equals the true label.
accuracy_score(y_true=y_validation, y_pred=y_pred_validation)

In [None]:
# Confusion matrix of the Gaussian NB predictions (rows: true label, columns: predicted).
confusion_matrix(y_true=y_validation, y_pred=y_pred_validation)

In [None]:
# F1 score of the Gaussian NB predictions on the validation split.
f1_score(y_true=y_validation, y_pred=y_pred_validation)

# Testing the data on Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and feature subsets at random, so without a
# fixed seed every run yields different scores. random_state=0 matches the seed used
# for the train/validation split and makes the results reproducible.
rand_forest = RandomForestClassifier(random_state=0)

# fit() returns the estimator itself, so model_randforest is the fitted rand_forest.
model_randforest = rand_forest.fit(X_train_bow, y_train)

# Predicted labels for the validation count matrix.
y_pred = model_randforest.predict(X_validation_bow)

In [None]:
# Show the random forest's validation predictions.
y_pred

# Checking the accuracy scores, confusion matrix and f1 score on Random Forest Classifier results

In [None]:
# Accuracy of the random forest (unigram counts) on the validation split.
accuracy_score(y_validation, y_pred)

In [None]:
# Confusion matrix of the random forest (unigram counts) on the validation split.
confusion_matrix(y_validation, y_pred)

In [None]:
# F1 score of the random forest (unigram counts) on the validation split.
f1_score(y_validation, y_pred)

# Making Bi-Gram features and training and testing on CatBoost Classifier

In [None]:
# Unigram + bigram counts, capped at 7000 features.
# NOTE: this rebinds `cv`, so the unigram vectorizer fitted earlier is no longer
# reachable under that name; every later cv.transform call uses this vectorizer.
cv = CountVectorizer(ngram_range = (1, 2), max_features=7000)

# Vocabulary is learned from the training split only, then reused for validation.
X_train_bow_bigram = cv.fit_transform(X_train['text_with_keywords']).toarray()
X_validation_bow_bigram = cv.transform(X_validation['text_with_keywords']).toarray()

In [None]:
# Encode the test tweets with the fitted unigram+bigram vectorizer (transform only, no refit).
X_test_bow_bigram = cv.transform(test['text_with_keywords']).toarray()

# Testing the data on Cat Boost Classifier

In [None]:
import catboost as cb

# Up to 10000 boosting rounds, with the iteration-based overfitting detector set to
# stop once the evaluation metric has not improved for 100 rounds.
cat_boost = cb.CatBoostClassifier(loss_function='CrossEntropy', iterations=10000, od_wait=100, od_type='Iter')

# The overfitting detector only works when fit() is given an eval_set; without one,
# od_type/od_wait are inert and all 10000 iterations run. The bigram validation split
# is used for that purpose here.
# NOTE(review): because the validation split now chooses the stopping point, scores
# computed on that same split are somewhat optimistic.
model_cb = cat_boost.fit(
    X_train_bow_bigram,
    y_train,
    eval_set=(X_validation_bow_bigram, y_validation),
)

# Predicting on validation data

In [None]:
# The CatBoost model was fitted on the unigram+bigram matrix, so it must be scored on
# the validation matrix built by that same vectorizer. X_validation_bow comes from the
# earlier unigram-only vectorizer: its columns are a different vocabulary.
y_predict_cb = model_cb.predict(X_validation_bow_bigram)

# Predicting on test data

In [None]:
# Predict labels for the test tweets from their unigram+bigram count matrix.
y_predict_cb_test = model_cb.predict(X_test_bow_bigram)

In [None]:
# Shape of the CatBoost test predictions.
y_predict_cb_test.shape

In [None]:
# Shape of the CatBoost validation predictions.
y_predict_cb.shape

# Checking the accuracy scores, confusion matrix and f1 score on Cat Boost Classifier results of validation data

In [None]:
# Accuracy of the CatBoost predictions against the validation labels.
accuracy_score(y_validation, y_predict_cb)

In [None]:
# Confusion matrix of the CatBoost predictions against the validation labels.
confusion_matrix(y_validation, y_predict_cb)

In [None]:
# F1 score of the CatBoost predictions against the validation labels.
f1_score(y_validation, y_predict_cb)

# Making word vectors using Bi Gram model and testing on Random Forest Classifier

In [None]:
# Refit the existing forest object on the unigram+bigram counts. fit() trains in place
# and returns the same estimator, so the earlier unigram fit is overwritten and
# model_randforest is just another name for rand_forest.
rand_forest.fit(X_train_bow_bigram, y_train)
model_randforest = rand_forest

# y_pred is rebound to the predictions of the refitted model.
y_pred = model_randforest.predict(X_validation_bow_bigram)

In [None]:
# (rows, features) of the unigram+bigram validation matrix.
X_validation_bow_bigram.shape

# Checking the accuracy scores, confusion matrix and f1 score on Random Forest Classifier for Bi gram model results

In [None]:
# Accuracy of the random forest refitted on unigram+bigram counts.
accuracy_score(y_validation, y_pred)

In [None]:
# F1 score of the random forest refitted on unigram+bigram counts.
f1_score(y_validation, y_pred)

In [None]:
# Confusion matrix of the random forest refitted on unigram+bigram counts.
confusion_matrix(y_validation, y_pred)

In [None]:
# Random forest predictions for the test tweets.
# NOTE(review): the submission cell below writes the CatBoost predictions, not these.
y_pred_test = model_randforest.predict(X_test_bow_bigram)

In [None]:
# Shape of the random forest test predictions.
y_pred_test.shape

In [None]:
# Load the sample submission file; its 'target' column is overwritten in the next cell.
sample_submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

In [None]:
# Write the CatBoost test predictions into the submission frame.
# A flat array is assigned by position and raises if its length differs from the
# frame, whereas assigning a DataFrame aligns on the index and would silently leave
# NaN on a mismatch. The int cast guarantees the labels are written as 0/1 even if the
# model returns them as floats (the training labels are float because the stacked
# 'target' column contained NaN for the test rows).
sample_submission['target'] = np.asarray(y_predict_cb_test).ravel().astype(int)

In [None]:
# Save the submission without the row index.
sample_submission.to_csv('sample_submission5.csv', index=False)

In [None]:
# Save the processed combined frame; the row index is written as the first column.
df.to_csv('nlp_tweets.csv')

In [None]:
# Show the processed test frame again (inspection only).
test

In [None]:
# Show the test frame as loaded, minus the dropped 'id' column (inspection only).
test_df