## Importing Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


import re

## Dataset Preprocessing & Cleaning

In [None]:
# Read the training split from the ../input directory and preview the first rows.
train_df = pd.read_csv(filepath_or_buffer="../input/nlp-getting-started/train.csv")
train_df.head(n=5)

In [None]:
# Read the test split from the ../input directory and preview the first rows.
test_df = pd.read_csv(filepath_or_buffer="../input/nlp-getting-started/test.csv")
test_df.head(n=5)

In [None]:
# (rows, columns) of the training and test frames, displayed as a pair.
(train_df.shape, test_df.shape)

In [None]:
# Print the training frame's column summary (names, non-null counts, dtypes).
train_df.info()

In [None]:
# Number of missing values in each column of the training frame.
train_df.isna().sum()

In [None]:
# Number of missing values in each column of the test frame.
test_df.isna().sum()

We can see that 2533 `location` values are missing. That is too large a share to impute without introducing large prediction errors, so we drop the `location` column
and fill the missing `keyword` values with the most frequent value.

In [None]:
# Drop `location` from both splits. Reassigning (instead of inplace=True) with
# errors="ignore" keeps this cell safe to re-run: a second run is a no-op
# rather than a KeyError on the already-removed column.
train_df = train_df.drop(columns=["location"], errors="ignore")
test_df = test_df.drop(columns=["location"], errors="ignore")

# Most frequent keyword, learned from the training split only. The same value
# is used for the test split so both splits go through identical preprocessing
# and nothing is estimated from the test data.
most_frequent_keyword = train_df["keyword"].value_counts().idxmax()

train_df["keyword"] = train_df["keyword"].fillna(most_frequent_keyword)
test_df["keyword"] = test_df["keyword"].fillna(most_frequent_keyword)

In [None]:
# Re-check the per-column missing-value counts, training frame first, then test.
for frame in (train_df, test_df):
    print(frame.isnull().sum())

In [None]:
# Bar chart of how many training rows fall into each `target` value.
sns.countplot(data=train_df, x="target")

In [None]:
# Remove the `id` column from both splits. Reassigning with errors="ignore"
# (rather than an in-place drop) lets the cell be re-run without raising
# KeyError once the column is already gone.
train_df = train_df.drop(columns=["id"], errors="ignore")
test_df = test_df.drop(columns=["id"], errors="ignore")

## Text Preprocessing
1. Replace every non-letter character (punctuation, digits, symbols) with a space
2. Convert upper case to lower case
3. Remove stop words

In [None]:
## Text cleaning: keep letters only, lower-case, then drop English stop words.
stop_words = stopwords.words('english')
# Set membership is O(1); the list above keeps its original name and type.
_stop_word_set = frozenset(stop_words)


def _clean_text(text_column):
    """Return a cleaned copy of a text Series.

    Steps, in order:
      1. replace every character outside a-z / A-Z with a space,
      2. lower-case the result,
      3. split on whitespace and drop tokens found in ``_stop_word_set``,
         then re-join the remaining tokens with single spaces.

    Missing values are treated as empty strings so the per-row split
    cannot fail on a non-string entry.
    """
    letters_only = (
        text_column
        .fillna("")
        .str.replace("[^a-zA-Z]", " ", regex=True)
        .str.lower()
    )
    # Filter whole words: iterating the string directly would walk it
    # character by character and never match a multi-letter stop word.
    return letters_only.apply(
        lambda sentence: " ".join(
            word for word in sentence.split() if word not in _stop_word_set
        )
    )


# Assign the result back to the frame: `.apply` returns a new Series, and an
# in-place call on `df["text"]` is not guaranteed to write through to `df`.
train_df["text"] = _clean_text(train_df["text"])
test_df["text"] = _clean_text(test_df["text"])


**The text is clean now. Let's look at some word-cloud visualizations of it.**

In [None]:
from wordcloud import WordCloud

# Word cloud built from every training text whose `target` is 0.
target0_corpus = " ".join(train_df.loc[train_df["target"] == 0, "text"])
wc = WordCloud(width=1600, height=800, max_words=2000).generate(target0_corpus)

plt.figure(figsize=(20, 20))
plt.imshow(wc, interpolation="bilinear")

In [None]:
from wordcloud import WordCloud

# Word cloud built from every training text whose `target` is 1.
target1_corpus = " ".join(train_df.loc[train_df["target"] == 1, "text"])
wc = WordCloud(width=1600, height=800, max_words=2000).generate(target1_corpus)

plt.figure(figsize=(20, 20))
plt.imshow(wc, interpolation="bilinear")

## Converting data to numpy array

In [None]:
# Separate the label column from the remaining training columns.
Y = train_df["target"]
X = train_df.drop(columns=["target"])

# The model input for every row is "<keyword> <text>", built the same way
# for the training features and for the test frame.
X["sentence"] = X["keyword"] + " " + X["text"]
test_df["sentence"] = test_df["keyword"] + " " + test_df["text"]

# Copy each sentence column out of pandas into a standalone NumPy array.
Xtrain = X["sentence"].to_numpy(copy=True)
Xtest = test_df["sentence"].to_numpy(copy=True)

## Data Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,  TfidfVectorizer, HashingVectorizer

# Learn the TF-IDF vocabulary on the training sentences and vectorise them in
# one pass; the fitted vectorizer is then reused unchanged on the test set.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(Xtrain)
x_test = vectorizer.transform(Xtest)

# Vocabulary terms, in feature-column order. `get_feature_names()` was removed
# in scikit-learn 1.2; prefer `get_feature_names_out()` and fall back to the
# old name only on releases that predate it. Kept as a list, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    keyword = list(vectorizer.get_feature_names_out())
else:
    keyword = vectorizer.get_feature_names()

## Voting Ensemble
1. LogisticRegression
2. ComplementNB
3. SVC

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import SVC

# The three base classifiers for the ensemble, each constructed with the
# library's default hyper-parameters.
model1, model2, model3 = LogisticRegression(), ComplementNB(), SVC()

In [None]:
# Combine the three base models with hard voting (voting='hard') and fit
# the ensemble on the vectorised training sentences.
base_estimators = [('lOG', model1), ('NB', model2), ('SVC', model3)]
final_model = VotingClassifier(estimators=base_estimators, voting='hard')

final_model.fit(x_train, Y)

## Model Prediction

In [None]:
# Predict a label for every vectorised test sentence.
pred = final_model.predict(x_test)

## filling submission.csv

# Start from the provided sample file and overwrite its `target` column with
# the predictions; rows are matched purely by position.
submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv").assign(target=pred)
submission.to_csv("submission.csv", index=False)

submission.head()