In [None]:
# Mount Google Drive inside the Colab runtime; the CSV files read later in
# this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
# Load the labelled training split (df) and the unlabelled test split (df2)
# from the mounted Drive folder.
df, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/kaggle/train.csv",
        "/content/drive/MyDrive/kaggle/test.csv",
    )
)

In [None]:
# Peek at the first five training rows.
df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Number of missing values per column.
df.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [None]:
# Frequency of each keyword value in the training split.
df.loc[:, 'keyword'].value_counts()

fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [None]:
# Remember the test-set ids before the column is dropped; the submission cell
# at the end of the notebook reads this list.
# NOTE: `id` shadows the Python builtin; the name is kept because that later
# cell refers to it.
id = df2['id'].tolist()

# Remove these three columns from both splits; the cells below only read
# 'text' and 'target'.
df.drop(columns=['keyword', 'location', 'id'], inplace=True)
df2.drop(columns=['keyword', 'location', 'id'], inplace=True)

In [None]:
import re

def question_preprocessing(text):
    ''' Normalise a piece of raw text and return it as a cleaned string.

    The input is converted with ``str()`` and lower-cased, characters outside
    a small allowed set are replaced by spaces, common English contractions
    are expanded, punctuation is either removed or padded with spaces, and a
    few shorthand forms ("10k", "1990s", "9/11", "e-mail", ...) are rewritten.

    Parameters
    ----------
    text : any
        Value to clean; non-string values (e.g. NaN / None) are stringified.

    Returns
    -------
    str
        The cleaned, lower-cased text. Runs of whitespace are collapsed to a
        single space, but a leading/trailing single space may remain.
    '''
    text = str(text)
    text = text.lower()

    # Clean the text
    # The hyphen is escaped so that "+-=" is NOT read as the range '+'..'='
    # (which silently kept ';' and '<'). ':' is listed explicitly because it
    # is handled by a dedicated rule further down.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=:]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    # Contraction suffixes are anchored with \b so that an opening quote in
    # front of a word ('shelter, 'demolished) is not mistaken for one.
    text = re.sub(r"\'s\b", " ", text)
    text = re.sub(r"\'ve\b", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't\b", " not ", text)
    text = re.sub(r"\bi'm\b", "i am ", text)
    text = re.sub(r"\'re\b", " are ", text)
    text = re.sub(r"\'d\b", " would ", text)
    text = re.sub(r"\'ll\b", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    # "10k" -> "10000", but leave units such as "5km" alone.
    text = re.sub(r"(\d+)k\b", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # "1990s" -> "1990". (The previous pattern "\0s" was an octal NUL escape
    # and could never match.)
    text = re.sub(r"0s\b", "0", text)
    # Keep the surrounding spaces so neighbouring words are not glued on.
    text = re.sub(r" 9 11 ", " 911 ", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    # Re-join digit groups split by the punctuation removal ("13,000" -> "13000").
    text = re.sub(r'(\d)\s+(\d)', r'\1\2', text)

    return text

In [None]:
# Clean the text column of both splits with the same normalisation function.
df['text'] = df['text'].map(question_preprocessing)
df2['text'] = df2['text'].map(question_preprocessing)

In [None]:
# Spot-check the cleaned text.
df.loc[:, 'text'].head(n=5)

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to helter in place are bei...
3    13000 people receive wildfires evacuation orde...
4    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
  
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def remove_stopwords_and_lemmatize(text):
    """Tokenize ``text``, drop English stop words and lemmatize what is left.

    Parameters
    ----------
    text : str
        A single (already cleaned) document.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    tokens = word_tokenize(text)
    kept = [tok for tok in tokens if tok.lower() not in stop_words]
    # lemmatize() is called without a part-of-speech argument.
    return " ".join(lemmatizer.lemmatize(tok) for tok in kept)


# Same transformation for the training and the test split (this was two
# copy-pasted loops before).
df['text'] = df['text'].apply(remove_stopwords_and_lemmatize)
df2['text'] = df2['text'].apply(remove_stopwords_and_lemmatize)
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,text,target
0,deed reason earthquake may allah forgive u,1
1,forest fire near la ronge sask canada,1
2,resident asked helter place notified officer e...,1
3,13000 people receive wildfire evacuation order...,1
4,got sent photo ruby alaska smoke wildfire pour...,1


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split reproducible.
X_tr, X_t, y_tr, y_t = train_test_split(
    df['text'].values,
    df['target'].values,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Learn the TF-IDF vocabulary on the training split only, then project both
# splits into that feature space.
# NOTE: this rebinds X_tr / X_t from raw text to sparse matrices, so the
# split cell must be re-run before this cell is executed a second time.
tfidf = TfidfVectorizer()
X_tr = tfidf.fit_transform(X_tr)
X_t = tfidf.transform(X_t)

# Fit the two baseline classifiers that the evaluation cell below scores.
# They were previously commented out, which made that cell fail with a
# NameError on a fresh kernel. The sparse matrix is passed directly:
# .todense() returns an np.matrix, which recent scikit-learn releases reject
# and which needlessly inflates memory use.
rf = RandomForestClassifier(random_state=42)  # seeded for reproducible scores
rf.fit(X_tr, y_tr)
l = LogisticRegression()
l.fit(X_tr, y_tr)

In [None]:
from sklearn.metrics import accuracy_score

# Score both baseline models on the held-out split. The sparse TF-IDF matrix
# is passed as-is: .todense() would produce an np.matrix, which recent
# scikit-learn releases refuse as input.
p = l.predict(X_t)
p2 = rf.predict(X_t)
print("Logistic : ", accuracy_score(y_t, p))
print("Random Forest : ", accuracy_score(y_t, p2))



Logistic :  0.7892317793827971
Random Forest :  0.7892317793827971


In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# import numpy as np

# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]
# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}
              
# # Use the random grid to search for best hyperparameters
# # First create the base model to tune
# rf = RandomForestClassifier()
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# # Fit the random search model
# rf_random.fit(X_tr, y_tr)
# print(rf_random.best_params_)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the same TF-IDF features. The sparse matrices
# are used directly instead of .todense(), which would build an np.matrix
# that recent scikit-learn releases refuse as input.
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_tr, y_tr)
p2 = naive_bayes_classifier.predict(X_t)
print("Naive : ", accuracy_score(y_t, p2))



Naive :  0.7971109652002626




In [None]:
# Final model: refit the vectorizer and Naive Bayes on ALL labelled rows and
# predict the unlabelled test split.
# NOTE: X_tr / X_t are rebound here to the full-train / test matrices.
tfidf = TfidfVectorizer()
X_tr = tfidf.fit_transform(df['text'].values)
X_t = tfidf.transform(df2['text'].values)

# Keep the matrices sparse: densifying them allocates one cell per
# (row, vocabulary term) pair, and the np.matrix returned by .todense() is
# rejected by recent scikit-learn releases.
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_tr, df['target'].tolist())
p2 = naive_bayes_classifier.predict(X_t)



In [None]:
# Assemble the submission table from the saved test ids and the predictions.
# Building it from a dict makes pandas raise if the two lengths differ,
# whereas zip() would silently truncate to the shorter one.
sol = pd.DataFrame({'id': id, 'target': p2})
sol.to_csv('submission.csv', index=False)

# Last expression of the cell so the preview is actually rendered.
sol.head()