In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the labelled training tweets and preview the first rows.
train_csv_path = "/kaggle/input/nlp-getting-started/train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.head(5)

In [None]:
# Load the unlabelled competition test tweets and preview the first rows.
test_csv_path = "/kaggle/input/nlp-getting-started/test.csv"
test_df = pd.read_csv(test_csv_path)
test_df.head(5)

In [None]:
# Compare the (rows, columns) shapes of the training and test frames.
train_df.shape, test_df.shape

## Dataset Properties

In [None]:
# Inspect the dtype pandas inferred for each training column.
train_df.dtypes

In [None]:
# Summary statistics of the numeric columns, one row per column.
train_df.describe().T

In [None]:
# Number of missing values in each training column.
train_df.isna().sum()

In [None]:
# Working copy of the training data without the row identifier.
X = train_df.drop(columns=["id"])
X.head(5)

In [None]:
# Cardinality of the identifier / categorical columns ...
for column in ("id", "keyword", "location"):
    print(train_df[column].nunique())
# ... the distinct label values ...
print(train_df["target"].unique())
# ... and the number of distinct tweets.
print(train_df["text"].nunique())

##  Data PreProcessing

### Filling Missing Values with the Most Frequent Value

In [None]:
# value_counts() sorts by descending frequency, so the first index entry is the
# most frequent (non-null) value of the column.
keyword_counts = X["keyword"].value_counts()
location_counts = X["location"].value_counts()

key = keyword_counts.index[0]
print("most frequent word in keyword is :", key)
loc = location_counts.index[0]
print("most frequent word in location is :", loc)

In [None]:
# Impute missing keyword / location entries with each column's most frequent value,
# then confirm that no nulls remain.
for column in ('keyword', 'location'):
    most_common = train_df[column].value_counts().idxmax()
    train_df[column] = train_df[column].fillna(most_common)
train_df.isnull().sum()

In [None]:
# Feature frame: every training column except the label.
X_new = train_df.drop(columns=["target"])
X_new.head(5)

In [None]:
# Label vector (the `target` column) and its length.
y = train_df.target
y.shape

In [None]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
import csv
from sklearn.feature_extraction.text import CountVectorizer,  TfidfVectorizer, HashingVectorizer
from sklearn.ensemble import RandomForestClassifier 
import nltk
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
# Word cloud of tweets that are NOT about a disaster (target == 0).
non_disaster_text = " ".join(train_df.loc[train_df.target == 0, "text"])
plt.figure(figsize=(20, 20))
Wc = WordCloud(max_words=500, width=1600, height=800).generate(non_disaster_text)
plt.axis("off")
plt.imshow(Wc, interpolation='bilinear');

In [None]:
# Word cloud of tweets that ARE about a disaster (target == 1).
disaster_text = " ".join(train_df.loc[train_df.target == 1, "text"])
plt.figure(figsize=(20, 20))
Wc = WordCloud(max_words=500, width=1600, height=800).generate(disaster_text)
plt.axis("off")
plt.imshow(Wc, interpolation='bilinear');

### Test Data PreProcessing

In [None]:
# Number of missing values in each test column.
test_df.isna().sum()

In [None]:
# Impute missing keyword / location entries in the test data with each column's
# own most frequent value, then confirm that no nulls remain.
for column in ('keyword', 'location'):
    most_common = test_df[column].value_counts().idxmax()
    test_df[column] = test_df[column].fillna(most_common)
test_df.isnull().sum()

## Text PreProcessing

### Removing punctuation, digits and every other non-letter character (e.g. , ? / @ # ! * ~)

In [None]:
# Replace every character that is not an ASCII letter with a single space
# in all string cells of the feature frame.
X_new = X_new.replace(to_replace="[^a-zA-Z]", value=" ", regex=True)
X_new.head(5)

### Dropping the id column and converting upper case to lower case so all string data are consistent

In [None]:
# The row identifier carries no textual signal, so remove it from the features.
X_new = X_new.drop(columns=["id"])
X_new.head(5)

In [None]:
# Lower-case every (string) column so that the same word always has the same form.
X_new = X_new.apply(lambda column: column.str.lower())
X_new.head(1)

### Remove StopWords

In [None]:
# Load NLTK's English stop-word list into `stop`.
# NOTE(review): this presumably needs the NLTK "stopwords" corpus to be available
# in the environment (e.g. via nltk.download("stopwords")) — confirm for runs
# outside Kaggle.
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [None]:
# Remove English stop words from every text column of the training features.
#
# The previous version had two defects that made it a no-op:
#   * iterating over a string yields single characters, so the list
#     comprehension compared characters (not words) against the stop-word list;
#   * Series.apply returns a new Series, and the result was never assigned back.
# Here each value is split into words, filtered, re-joined into a string and
# written back to its column.
stop_words = set(stop)  # set membership is O(1) per word
for col in ('keyword', 'location', 'text'):
    X_new[col] = X_new[col].apply(
        lambda sentence: " ".join(
            word for word in sentence.split() if word not in stop_words
        )
    )
print(X_new.shape)

## Text PreProcessing in Test Data

In [None]:
# Test features without the row identifier (test_df keeps "id" for the submission).
ori_test = test_df.drop(columns=["id"])

In [None]:
# Apply to the test features the same text cleaning used for the training features.

# Replace every character that is not an ASCII letter (punctuation, digits,
# symbols) with a space.
ori_test = ori_test.replace("[^a-zA-Z]", " ", regex=True)

# Convert upper case to lower case so all string data share one form.
for i in ori_test.columns:
    ori_test[i] = ori_test[i].str.lower()

# Remove stop words.
# The previous version iterated over the *characters* of each string and threw
# the result of .apply away, so nothing was removed. Split each value into
# words, drop the stop words, re-join and assign the result back.
stop_words = set(stop)  # set membership is O(1) per word
for i in ('keyword', 'location', 'text'):
    ori_test[i] = ori_test[i].apply(
        lambda sentence: " ".join(
            word for word in sentence.split() if word not in stop_words
        )
    )
print(ori_test.shape)

### Splitting data

In [None]:
# Join keyword and text into a single "sentence" feature, because both columns
# are informative for the prediction.

# Training data
X_new["sentence"] = X_new['keyword'].str.cat(X_new['text'], sep=" ")
train_text = np.array(X_new["sentence"])
first_train_sentence = train_text[0]
print(first_train_sentence)
print(f" train_text type : '{type(train_text)}' ")

# Test data
ori_test["sentence"] = ori_test['keyword'].str.cat(ori_test['text'], sep=" ")
test_text = np.array(ori_test["sentence"])
first_test_sentence = test_text[0]
print(first_test_sentence)

In [None]:
# Print the first element of the training text array.
print(train_text[0])

In [None]:
# Display the first element of the test text array (cell output, not print).
test_text[0]

In [None]:
# Hold out 25% of the labelled sentences for validation (fixed seed for
# reproducibility) and report the size of each part.
x_train, x_test, y_train, y_test = train_test_split(
    train_text, y, test_size=0.25, random_state=0
)
for part in (x_train, x_test, y_train, y_test):
    print(part.shape)

## TfidfVectorizer converts text into TF-IDF weighted bag-of-words vectors

In [None]:
# Encode the combined "keyword + text" sentences as TF-IDF weighted
# bag-of-words vectors.
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training split only, and
# transform it in the same pass (previously the fit_transform result was
# discarded and the training data was transformed a second time).
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)
X_test_new = vectorizer.transform(test_text)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name only on releases that
# predate the new one. list(...) keeps `keyword` a plain list as before.
if hasattr(vectorizer, "get_feature_names_out"):
    keyword = list(vectorizer.get_feature_names_out())
else:
    keyword = vectorizer.get_feature_names()

print(" after encoding in bow the size of keyword:",len(keyword))
print(" train feature --",X_train.shape,y_train.shape)
print("test feature --",X_test.shape,y_test.shape)
print("new test feature --",X_test_new.shape)

In [None]:
# Show one training example before and after vectorization.
raw_example = x_train[0]
vector_example = X_train[0]
print("data before vectorization :\n", raw_example)
print("\n")
print("data after vectorization in vector form : \n", vector_example)

## Predict

In [None]:
# Train a random forest on the vectorized training split (fit returns the
# estimator itself) ...
rc = RandomForestClassifier(
    n_estimators=300,
    max_depth=400,
    random_state=0,
).fit(X_train, y_train)

# ... and predict the labels of the held-out validation split.
pred1 = rc.predict(X_test)

In [None]:
# Validation metrics: confusion matrix, accuracy and per-class report.
matrix = confusion_matrix(y_test, pred1)
score = accuracy_score(y_test, pred1)
report = classification_report(y_test, pred1)
for metric in (matrix, score, report):
    print(metric)

# Predictions for the competition test set (used for the submission file).
prediction1 = rc.predict(X_test_new)
prediction1

## Save Result

In [None]:
# Assemble the submission frame: one row per test tweet with its predicted label.
data = dict(id=test_df["id"], target=prediction1)
output = pd.DataFrame(data, columns=['id', 'target'])
output.index = test_df.index

# Write the file into the current working directory (next to the notebook) ...
output.to_csv("submission.csv", index=False)

# ... and read it back to check what was written.
a = pd.read_csv("submission.csv")
a