In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize,RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.metrics import f1_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This notebook predicts whether a tweet is about a natural disaster or not. A human reader usually decides this from familiar ("pre-fed") words and phrases and from how intense they are. For example, "shock" on its own reads as a state of mind, but "Electric Shock" is perceived differently because of its intensity.

For the machine to make the same judgement, we need to feed in that knowledge — in particular the pre-fed words — so that it can decide how each tweet should be classified.
<br>
Coming to the code, we will complete the notebook in phases :
<br>
Phase 1 - Cleaning and Preprocessing<br>
Phase 2 - Model Building<br>
Phase 3 - Model Testing
<br><br>
PHASE 1 <br>
The data is cleaned and processed by:
* Removing Nulls
* Removing Duplicates
* Checking and Formatting Column DataTypes
* Reshaping the data if needed
* Word Tokenization and Count Vector
* Tweet Understanding using the vector
* New Dataset creation
* Sanity Checks on the new dataset
<br>

PHASE 2<br>
The models used for classification are a Decision Tree and a Random Forest.
<br><br>
PHASE 3<br>
The model is run on test data and evaluated for its primary metric.<br><br>

In Tweet Understanding, we count the number of disaster words in each tweet using a pre-fed word list and store that count in a new column, which serves as our feature vector.

In [None]:
# Load the train and test splits of the tweet data, then preview the training frame.

train_csv_path = '../input/nlp-getting-started/train.csv'
test_csv_path = '../input/nlp-getting-started/test.csv'
tweet_data_train, tweet_data_test = (
    pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path)
)
tweet_data_train.head()

In [None]:
# Summarise the training frame (column dtypes and non-null counts) to spot missing values.
tweet_data_train.info()

In [None]:
# Vocabulary used by make_feature_col (defined below in this cell) to build the
# feature column: a hand-curated list of disaster-related words in singular,
# plural and hashtag form, extended with the `keyword` values of both datasets.

disaster_words=['forest','fire','forest fire','earthquake',
                'landslide','typhoon','hurricane','attack',
                'ablaze','rains','avalanche','rescue','help',
                'hurt','god','hell','died','injured','succumbed',
                'mayhem','torrential','devastation','terror',
                'stuck','storm','unpleasant','havoc','terrorist',
                'tsunami','wildfire','hailstorm','snowfall',
                'sinkhole','pelting','war','riot','weapon',

                'forests','fires','forest fires','earthquakes',
                'landslides','typhoons','hurricanes','attacks',
                'ablazes','rain','avalanches','rescues','terrors',
                'storms','terrorists','tsunamis','wildfires',
                'hailstorms','snowfalls','sinkholes','peltings',
                'riots','weapons',

                '#forest','#fire','#forest fire','#earthquake',
                '#landslide','#typhoon','#hurricane','#attack',
                '#ablaze','#rains','#avalanche','#rescue','#help',
                '#hurt','#god','#hell','#died','#injured','#succumbed',
                '#mayhem','#torrential','#devastation','#terror',
                '#stuck','#storm','#unpleasant','#havoc','#terrorist',
                '#tsunami','#wildfires','#wildfire','#hailstorm','#snowfall',
                # The comma after '#weapon' matters: without it Python joins the
                # adjacent literals into the single bogus entry '#weapon#forests'.
                '#sinkhole','#pelting','#war','#riot','#weapon',

                '#forests','#fires','#forest fires','#earthquakes',
                '#landslides','#typhoons','#hurricanes','#attacks',
                '#ablazes','#rain','#avalanches','#rescues','#terrors',
                '#storms','#terrorists','#tsunamis','#wildfires',
                '#hailstorms','#snowfalls','#sinkholes','#peltings',
                '#riots','#weapons']

# dropna() keeps any missing keyword out of the vocabulary (it would otherwise
# be added as a float NaN); astype(str) guarantees every entry is a string.
keywords=(list(tweet_data_train['keyword'].dropna().astype(str))
          +list(tweet_data_test['keyword'].dropna().astype(str)))
disaster_words+=keywords
# De-duplicate, and sort so the vocabulary order is the same on every run.
disaster_words=sorted(set(disaster_words))
def make_feature_col(x, vocabulary=None, stop_words=None):
    r"""Add a 'Number of Disaster Words' column to ``x`` and return ``x``.

    Each entry of ``x['text']`` is lower-cased and split into ``\w+`` tokens.
    Stop words are discarded, and every remaining token that appears in the
    vocabulary is counted (a repeated word counts each time it occurs).

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with a 'text' column. It is modified in place: the count column
        is added to (or overwritten on) this same object.
    vocabulary : iterable, optional
        Words that count as disaster words. Defaults to the module-level
        ``disaster_words``. Non-string entries (e.g. NaN) are ignored.
    stop_words : iterable of str, optional
        Tokens to discard before counting. Defaults to NLTK's English
        stop-word list.

    Returns
    -------
    pandas.DataFrame
        The same object ``x``, now carrying an integer
        'Number of Disaster Words' column.
    """
    import re  # local import so the function does not rely on a top-level `re`

    if vocabulary is None:
        vocabulary = disaster_words
    if stop_words is None:
        stop_words = stopwords.words('english')

    # Sets give O(1) membership tests instead of scanning a list per token.
    vocab = {word for word in vocabulary if isinstance(word, str)}
    ignored = set(stop_words)
    # Same tokens as nltk's RegexpTokenizer(r'\w+'): maximal runs of word chars.
    find_tokens = re.compile(r'\w+').findall

    # Iterate over the values rather than positional labels so the function
    # works for any index, and build the whole column at once instead of
    # writing through a chained `x[col].iloc[i] = ...` assignment.
    counts = [
        sum(
            1
            for token in find_tokens(text.lower() if isinstance(text, str) else '')
            if token not in ignored and token in vocab
        )
        for text in x['text']
    ]
    x['Number of Disaster Words'] = counts
    return x

In [None]:
! pip install wordcloud

In [None]:
# wordcloud is imported here, after the install cell above, because it is only
# available once that cell has run.
from wordcloud import WordCloud

# Join only real strings: slicing off element 0 to skip a possible NaN entry is
# unreliable because the vocabulary's order comes from a set.
unique_string=" ".join(word for word in disaster_words if isinstance(word, str))
wordcloud = WordCloud(width = 1000, height = 500).generate(unique_string)

fig, ax = plt.subplots(figsize=(15,8))
ax.imshow(wordcloud)
ax.set_title("Disaster vocabulary")
ax.axis("off")
plt.show()

In [None]:
# Add the 'Number of Disaster Words' feature column to the training tweets and preview the result.
tweet_data_train=make_feature_col(tweet_data_train)
tweet_data_train.head()

In [None]:
# Keep only the identifier, the engineered feature and the label for modelling.
model_columns = ['id', 'Number of Disaster Words', 'target']
new_tweet_data_train = tweet_data_train.loc[:, model_columns]

In [None]:
# Display the reduced training frame as a sanity check on the new dataset.
new_tweet_data_train

In [None]:
# Test data: build the same feature column, then keep only the id and the feature.

test_columns = ['id', 'Number of Disaster Words']
tweet_data_test = make_feature_col(tweet_data_test).loc[:, test_columns]
tweet_data_test.head()


In [None]:
# Model Building

# Fixed seed so the split and the fitted models are reproducible on re-run.
SEED = 42

# Single-feature design matrix of shape (n_samples, 1) and the binary label.
X = new_tweet_data_train['Number of Disaster Words'].to_numpy().reshape(-1, 1)
y = new_tweet_data_train['target']

# Hold-out estimate: scores on the training rows alone are optimistic, so also
# report F1 on a stratified 20% split that the scored models never saw.
X_fit, X_val, y_fit, y_val = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)
for label, holdout_model in (
    ('Decision Tree', DecisionTreeClassifier(max_depth=10000, random_state=SEED)),
    ('Random Forest', RandomForestClassifier(random_state=SEED)),
):
    holdout_model.fit(X_fit, y_fit)
    print('F1 of {} Classifier on hold-out set: {:.2f}'
         .format(label, f1_score(y_val, holdout_model.predict(X_val))))

# Final models are fitted on all labelled rows and used for the test predictions.
dtc = DecisionTreeClassifier(max_depth=10000, random_state=SEED)
rfc = RandomForestClassifier(random_state=SEED)
dtc.fit(X, y)
rfc.fit(X, y)

print('Accuracy of Decision Tree Classifier on training set: {:.2f}'
     .format(dtc.score(X, y)))
print('Accuracy of Random Forest Classifier on training set: {:.2f}'
     .format(rfc.score(X, y)))

XTest=tweet_data_test['Number of Disaster Words']


In [None]:
# Reshape the test feature into a single-column matrix once, then predict with both models.
test_features = np.array(XTest).reshape(-1, 1)

pred_dtc = dtc.predict(test_features)
print(pred_dtc)

pred_rfc = rfc.predict(test_features)
print(pred_rfc)

In [None]:
# Show both lengths together: a cell only renders its last expression, so two
# separate bare len() calls would display the second value only.
len(pred_dtc), len(pred_rfc)

In [None]:
# Pair each test tweet id with the label predicted by each model.
submission_ids = tweet_data_test['id']
output_dtc = pd.DataFrame({'id': submission_ids, 'target': pred_dtc})
output_rfc = pd.DataFrame({'id': submission_ids, 'target': pred_rfc})

In [None]:
# Write the Random Forest predictions to submission.csv, without the index column.
output_rfc.to_csv('submission.csv',index=False)