In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualisation
import matplotlib.pyplot as plt #plotting data
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the competition's training and test CSV files into DataFrames.
train_path = "/kaggle/input/nlp-getting-started/train.csv"
test_path = "/kaggle/input/nlp-getting-started/test.csv"
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [4]:
# Number of missing values in each column of the training frame.
missing_mask = train.isna()
missing_mask.sum()

In [5]:
# Heatmap of the boolean missing-value mask: one column per feature,
# one (unlabelled) row per training example.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(train.isna(), yticklabels=False, cbar=False, cmap='Reds', ax=ax)
ax.set_title("Missing values", fontsize=14)
# Tilt the column names so they stay readable.
plt.setp(ax.get_xticklabels(), rotation=35, fontsize=12)
plt.show()

In [6]:
# Class balance of the training labels.
# Rows per target value, ordered by target value (groupby sorts its keys).
target_counts = train.groupby('target')['id'].count()

# Compute the percentage shown in each label from the data itself rather than
# hard-coding it, so the chart stays correct if the training set changes.
target_share = 100 * target_counts / target_counts.sum()
class_names = ['Not Disaster', 'Disaster']  # same positional order as the original labels
pie_labels = [f"{name} ({share:.0f}%)" for name, share in zip(class_names, target_share)]

plt.pie(target_counts, labels=pie_labels, colors=['lightcoral', 'lightskyblue'])
plt.title('Target Distribution in Training Set', fontsize=13)
plt.show()

In [7]:
# Ten most frequent values of the `location` column (missing values are not counted).
location_counts = train["location"].value_counts()
location_counts.head(10)

In [8]:
# data cleaning
# Drop the columns that the models below do not use.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# pandas 2.0 no longer accepts. `errors='ignore'` keeps the cell re-runnable:
# `train` is overwritten here, so on a second run the columns are already gone.
train = train.drop(columns=['id', 'location', 'keyword'], errors='ignore')
train.head()

In [9]:
# Bag-of-words baseline: each tweet becomes a vector of token counts.
# (TF-IDF weighting is a possible later refinement.)
count_vectorizer = feature_extraction.text.CountVectorizer()

train_text = train["text"]
test_text = test["text"]

# Labels and features for the training rows; fitting learns the vocabulary
# from the training text.
# NOTE(review): the vocabulary is learned from all training rows, including
# the ones later held out for validation by the split cell.
y = train['target']
X = count_vectorizer.fit_transform(train_text)

# The test text is only transformed, never fitted: it must be mapped onto the
# vocabulary learned from the training text so that both matrices share the
# same columns.
test_vectors = count_vectorizer.transform(test_text)


In [10]:
# Hold out 20% of the training rows so the candidate models can be compared
# on data they were not fitted on; the best one is then applied to the test set.
# Uses the `model_selection` module already imported at the top of the notebook
# instead of a second, mid-notebook import. The fixed seed keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=123
)


In [11]:
# Ridge classifier: fit on the training split, then report accuracy on both
# the training split and the held-out split.
clf = linear_model.RidgeClassifier()
model = clf.fit(x_train, y_train)

ridge_train_acc = model.score(x_train, y_train)
ridge_holdout_acc = model.score(x_test, y_test)
print("train accuracy:", ridge_train_acc, "\n", "test accuracy:", ridge_holdout_acc)

In [12]:
# L2-penalised logistic regression; max_iter is raised to 2000 to give the
# solver more iterations on the sparse count features.
lr = linear_model.LogisticRegression(penalty='l2', max_iter=2000)
model1 = lr.fit(x_train, y_train)

lr_train_acc = model1.score(x_train, y_train)
lr_holdout_acc = model1.score(x_test, y_test)
print("train accuracy:", lr_train_acc, "\n", "test accuracy:", lr_holdout_acc)


In [13]:
# decision tree classifier
# Seeded like the train/validation split and the random forest: without a
# random_state the tree's tie-breaking between equally good splits is random,
# so the reported accuracies could differ from one run to the next.
dt = DecisionTreeClassifier(random_state=123)
model2 = dt.fit(x_train, y_train)
print("train accuracy:", model2.score(x_train, y_train), "\n", "test accuracy:", model2.score(x_test, y_test))


In [14]:
# Random forest with a fixed seed: fit on the training split, then report
# accuracy on both the training split and the held-out split.
rt = RandomForestClassifier(random_state=123)
model3 = rt.fit(x_train, y_train)

rf_train_acc = model3.score(x_train, y_train)
rf_holdout_acc = model3.score(x_test, y_test)
print("train accuracy:", rf_train_acc, "\n", "test accuracy:", rf_holdout_acc)

We observe that the logistic regression and ridge classifiers work best for the given dataset. Hence we make the final predictions using the logistic regression model.

In [15]:
# Refit the ridge classifier on the full training set (not just the training
# split) before predicting on the test set.
clf.fit(X, y)

# Use the sample submission as a template and fill in the predicted labels.
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
sample_submission["target"] = clf.predict(test_vectors)

# NOTE: the logistic-regression cell that follows writes to the same
# "submission.csv", so after a full run the file holds those predictions.
sample_submission.to_csv("submission.csv", index=False)

# Preview goes last: a notebook cell only displays its final expression, so
# the original mid-cell `.head()` was silently discarded.
sample_submission.head()

In [16]:
# Refit logistic regression on the full training set (not just the training
# split) before predicting on the test set.
lr.fit(X, y)

# Use the sample submission as a template and fill in the predicted labels.
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
sample_submission["target"] = lr.predict(test_vectors)
sample_submission.to_csv("submission.csv", index=False)

# Preview goes last: a notebook cell only displays its final expression, so
# the original mid-cell `.head()` was silently discarded.
sample_submission.head()