In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import some of the important libraries.

In [None]:
import numpy as np
import pandas as pd

import nltk
import string
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.model_selection import train_test_split

from scipy.sparse import csr_matrix

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

#import accuracy
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression

Load the dataset

In [None]:
# Read the labelled training tweets and the unlabelled test tweets.
train, test = (
    pd.read_csv('../input/nlp-getting-started/train.csv'),
    pd.read_csv('../input/nlp-getting-started/test.csv'),
)

Make a copy so that the original dataset is left unchanged.

In [None]:
tr = train.copy()

In [None]:
tr.shape

In [None]:
tr.head()

Let's drop the 'id' column, as it is not useful for the analysis.

In [None]:
# Remove the 'id' column from the working copy (rebinding instead of mutating in place).
tr = tr.drop(columns='id')

In [None]:
# Keep the test-set ids aside; the submission cell uses them to label each prediction.

test_id = test['id']

Check for duplicate records

In [None]:
# Select every row whose tweet text occurs more than once (all copies, not only the repeats).
repeated_text_mask = tr.duplicated(subset=['text'], keep=False)
duplicates_record = tr.loc[repeated_text_mask]
duplicates_record

In [None]:
duplicates_record.shape

Around 179 duplicate records are found (rows whose text appears more than once). Now let's consider the pair (text, target): keep only the first record of each duplicated pair and remove the rest.

In [None]:
# Keep the first row of every identical (text, target) pair and renumber the index.
tr = tr.drop_duplicates(subset=['text', 'target'], keep='first', ignore_index=True)

Now lets check again

In [None]:
# Re-check: rows that still share their text with at least one other row.
still_repeated_mask = tr.duplicated(subset=['text'], keep=False)
duplicates_record = tr.loc[still_repeated_mask]
duplicates_record.head(6)

We still see several duplicate records, but with one difference: this time the same tweet appears once with target 0 and once with target 1. These look like corrupted (contradictorily labelled) records that may confuse the ML model, so it is better to remove them.

In [None]:
# Discard every remaining row whose text is repeated (keep=False drops all copies) and renumber the index.
tr = tr.drop_duplicates(subset=['text'], keep=False, ignore_index=True)

Now Lets Check if there is any missing values.

In [None]:
tr.isna().sum()

Keyword has 56 missing values; we will deal with those shortly. Location also has many missing values, and for this analysis I am treating Location as an unimportant feature, so let's move on.

In [None]:
tr['keyword'].value_counts()

In [None]:
tr['location'].nunique()

In [None]:
tr[~tr['location'].isna()]

Now lets look at target variable.

In [None]:
tr['target'].value_counts()

In [None]:
# Pass the column with the `x=` keyword: seaborn 0.12+ treats the first positional
# argument as `data`, so a bare positional Series no longer means "count this variable".
sns.countplot(x=tr['target']);

Seems to be very much balanced dataset.

Let's separate out the target variable.

In [None]:
# Pull the label column out of the feature frame: `pop` returns it and removes it from `tr`.
Y = tr.pop('target')

In [None]:
tr.shape

In [None]:
test.shape

In [None]:
test.head()

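Here, I am combining the train and test datasets before converting words to vectors.
The reason is that if we vectorize them separately (fitting a new vectorizer on each), the resulting feature columns will most likely differ between the train and test sets, and the model needs identical features at prediction time. (An alternative that gives the same columns without letting test-set text influence the features is to fit the vectorizer on the training tweets only and call `transform` on the test tweets.)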

In [None]:
# Stack the training rows on top of the test rows so both are cleaned and vectorized together.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the test rows
# reuse the training rows' labels, leaving duplicate index values that make any later
# label-based alignment fragile.
tr = pd.concat([tr, test], axis=0, ignore_index=True)

In [None]:
tr.shape

Now lets do some text cleaning

In [None]:
# Characters kept during text cleaning: the space plus the 26 lowercase ASCII letters.
alpha = [' '] + list(string.ascii_lowercase)

Here are the two methods (stemmer and lemmatizer) for reducing a word to its root form.

In [None]:
# Porter stemmer: used by cleanKeyword and cleanText to reduce each word to its stem.
ps = nltk.PorterStemmer()
# WordNet lemmatizer: the alternative approach; it is not referenced by the visible cells of this notebook.
wn = nltk.WordNetLemmatizer()

In [None]:
tr['keyword'].nunique()

Let's not remove the rows with a missing keyword; instead, to avoid data loss, we can replace NaN with the text 'missing'.

In [None]:
# Replace missing keywords with the placeholder text 'missing' so no row is lost.
tr['keyword'] = tr['keyword'].fillna('missing')

In [None]:
tr['keyword'].unique()

Now let's clean the keyword. Some keywords contain '%20' (the URL-encoded form of a space), so let's replace those with a space.

In [None]:
def cleanKeyword(text):
    """Normalise a keyword string.

    Lower-cases the keyword, turns every '%20' into a space, and stems each
    space-separated word with the module-level Porter stemmer `ps`.

    Parameters
    ----------
    text : str
        Raw keyword.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    spaced = text.lower().replace('%20', ' ')
    stems = []
    for token in spaced.split(' '):
        stems.append(ps.stem(token))
    return ' '.join(stems)

In [None]:
tr['clean_keyword'] = tr['keyword'].apply(cleanKeyword)

In [None]:
tr['clean_keyword'].unique()

In [None]:
tr['clean_keyword'].nunique()

Much cleaner and shorter: earlier there were 221 unique values in total, and after cleaning this is reduced to 167.

In [None]:
stopWords = stopwords.words('english')

In [None]:
punct = string.punctuation
punct

In [None]:
#stopWords

Now lets clean the Text variable.

In [None]:
def cleanText(text):
    """Clean a tweet for vectorization.

    Steps:
      1. Lower-case the text.
      2. Keep only the characters listed in `alpha` (space and a-z). Any other
         whitespace character (newline, tab, ...) is converted to a space
         rather than deleted, so words on either side of a line break stay
         separate instead of being glued into one token.
      3. Drop stop words and one-character tokens, then stem what remains with
         the module-level Porter stemmer `ps`.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    kept = []
    for char in text.lower():
        if char.isspace():
            kept.append(' ')      # every kind of whitespace acts as a word separator
        elif char in alpha:
            kept.append(char)     # letters survive; digits and punctuation are dropped

    words = ''.join(kept).split(' ')
    # `and` (not bitwise `&`) is the boolean operator intended here; the length
    # check also discards the empty strings produced by consecutive spaces.
    return ' '.join(
        ps.stem(word)
        for word in words
        if len(word) > 1 and word not in stopWords
    )

In [None]:
tr['text_clean'] = tr['text'].apply(cleanText)

In [None]:
tr.head()

On its own, the 'clean_keyword' feature does not contribute much to the analysis, so let's combine it with 'text_clean' to create a new feature, 'clean_tweet'.

In [None]:
tr['clean_tweet'] = tr['text_clean'] + ' ' + tr['clean_keyword']

Now let's apply a word-to-vector method.

In [None]:
# Fit a TF-IDF vectorizer (vocabulary capped at 2700 terms, sublinear tf scaling)
# on the combined clean tweets and build the feature matrix.
vector = TfidfVectorizer(max_features=2700, sublinear_tf=True)
tweet_corpus = tr['clean_tweet'].values
X = vector.fit_transform(tweet_corpus)

Here, I am using 2700 for max_features: I tried several values from 500 to 3000, and 2700 gave the best accuracy.

It is also important to set some value for max_features: if it is not set, TfidfVectorizer creates a feature for every distinct word, and you end up with a very large number of features (more than 25000 in this case), which may cause a memory error while building the model.

By giving max_features a value (say 2700), the vectorizer keeps only the 2700 terms with the highest frequency across the corpus.

In [None]:
# Column names for the TF-IDF matrix. `get_feature_names` was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer `get_feature_names_out` and
# fall back to the old method only on releases that lack it.
if hasattr(vector, 'get_feature_names_out'):
    X_col = list(vector.get_feature_names_out())
else:
    X_col = vector.get_feature_names()

X_col holds the names of all 2700 selected features.

In [None]:
df = pd.DataFrame.sparse.from_spmatrix(X, columns = X_col)

In [None]:
df.head()

In [None]:
df.shape

Now lets separate out Test and train data.

In [None]:
# The training rows were concatenated first, so everything after the first len(Y)
# rows belongs to the test set. Deriving the boundary from Y avoids a hard-coded
# row count that silently goes stale if the de-duplication step changes.
test = df.iloc[len(Y):]

In [None]:
# Renumber the test rows from 0, discarding the old index.
test = test.reset_index(drop=True)

In [None]:
test.head()

In [None]:
# The first len(Y) rows of the combined feature frame are the labelled training rows
# (train was concatenated before test). Using len(Y) instead of a hard-coded count keeps
# the features and labels aligned even if the number of training rows changes.
train_df = df.iloc[:len(Y)]

In [None]:
test.shape

All looks good, lets create Model

In [None]:

# 10-fold cross-validation (shuffled, fixed seed) of a default logistic regression
# on the TF-IDF training features; report the mean accuracy across folds.
model = LogisticRegression()
cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(model, train_df, Y, cv=cv, scoring='accuracy', n_jobs=-1)
print('Accuracy: ', scores.mean())

About 80.4% accuracy — not bad.

In [None]:
model.fit(train_df, Y)

Now let's use CountVectorizer and see whether it does better than TF-IDF.

In [None]:
# Fit a bag-of-words count vectorizer (vocabulary capped at 2500 terms) on the
# combined clean tweets and build the count matrix.
count_vector = CountVectorizer(max_features=2500, encoding='utf-8')
count_corpus = tr['clean_tweet'].values
X_count = count_vector.fit_transform(count_corpus)

In [None]:
# Column names for the count matrix. `get_feature_names` was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer `get_feature_names_out` and
# fall back to the old method only on releases that lack it.
if hasattr(count_vector, 'get_feature_names_out'):
    X_count_col = list(count_vector.get_feature_names_out())
else:
    X_count_col = count_vector.get_feature_names()

In [None]:
train_mat = pd.DataFrame.sparse.from_spmatrix(X_count, columns = X_count_col)

In [None]:
train_mat.head()

In [None]:
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
model = LogisticRegression()
# Evaluate on the count-vector features. The earlier version passed `train_df`
# (the TF-IDF features) here, so it merely repeated the previous experiment.
# `train_mat` was built from the combined train+test tweets, so keep only the
# leading rows that have a label in `Y`.
count_train = train_mat.iloc[:len(Y)]
scores = cross_val_score(model, count_train, Y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: ', scores.mean())

Exactly same......

In [None]:
model.fit(train_df, Y)

## **Submission**

In [None]:
y_pred = model.predict(test)

In [None]:
len(y_pred)

In [None]:
# Build the submission table: the stored test ids next to the model's predictions
# on the test dataset.
submit = pd.DataFrame({'id': test_id, 'target': y_pred})
submit.head()

In [None]:
len(submit) == len(test)

In [None]:
# Write the submission dataframe to CSV (without the index column) for the Kaggle submission.
# NOTE(review): '../' resolves to the parent of the current working directory, while the
# header comments of this notebook say only files written to the current directory are
# preserved as output — confirm this path is where the submission file is expected.
submit.to_csv('../Disaster tweet.csv', index=False)
print('Submission CSV is ready!')

In [None]:
submissions_check = pd.read_csv("../Disaster tweet.csv")
submissions_check.head()