In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import string
import spacy
import re

# Shared spaCy pipeline (small English model) used for tokenising and lemmatising tweets.
nlp = spacy.load(name="en_core_web_sm")

import warnings
# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier

In [None]:
from sklearn.metrics import accuracy_score, zero_one_loss, classification_report

### **Load Data**

In [None]:
# Read the labelled training set and the unlabelled test set of the competition.
data_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)

In [None]:
# Preview the first five training rows.
data_df.head(n=5)

In [None]:
# Report the row and column counts of both frames.
train_rows, train_cols = data_df.shape
test_rows, test_cols = test_df.shape

print("Total rows in train data: ", train_rows)
print("Total columns in train data: ", train_cols)
print("-"*30)
print("Total rows in test data: ", test_rows)
print("Total columns in test data: ", test_cols)

In [None]:
# Missing values per column: train set first, then test set, separated by a rule.
print(
    data_df.isna().sum(),
    "-"*30,
    test_df.isna().sum(),
    sep="\n",
)

In [None]:
# Number of non-null tweets for each target value.
data_df.groupby(data_df["target"])["text"].count()

In [None]:
# Bar chart of how many rows each target value has.
# The column is passed by keyword: in recent seaborn releases the first positional
# parameter is `data`, so passing the Series positionally together with `data=` fails.
sns.countplot(x='target', data=data_df)

### **Trim extra rows with target '0' to avoid unfair training**
This unbalanced dataset might lead to sluggish training and biased predictions.
We need an equal number of rows for each target for better performance.
***But losing about 1,000 rows is a huge loss; we could use these rows for validation.***

In [None]:
# Down-sample every class to the size of the rarest one so that all targets are
# equally represented. The size is derived from the data instead of being hard-coded,
# and the sampling is seeded so that "Restart & Run All" reproduces the same rows.
rows_per_class = int(data_df.target.value_counts().min())
data_df = pd.concat(
    [
        class_rows.sample(n=rows_per_class, random_state=42)
        for _, class_rows in data_df.groupby('target')
    ]
).reset_index(drop=True)

# Column passed by keyword; a positional Series plus `data=` fails on recent seaborn.
sns.countplot(x='target', data=data_df)

### **Split Data into Train and Validation Data**

In [None]:
# Shuffle once (seeded for reproducibility) and drop the old index instead of
# keeping it as an extra "index" column.
data_df = data_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Cut the SAME shuffled frame at a single position so the two parts are disjoint.
# Slicing a second, independent shuffle would leak training rows into validation.
# An 80/20 cut reproduces the original 5233-row training set when there are 6542 rows.
split_at = int(len(data_df) * 0.8)
train_df, valid_df = data_df.iloc[:split_at, :], data_df.iloc[split_at:, :]

In [None]:
# Show the (rows, columns) shape of each split.
train_shape, valid_shape = train_df.shape, valid_df.shape
divider = "-"*30

print("Total rows and columns in train data is: ", train_shape)
print(divider)
print("Total rows and columns in validate data is: ", valid_shape)

In [None]:
class DisasterTweetModel:
    """Tweet classifier: spaCy-based cleaning, TF-IDF features, Passive-Aggressive model.

    Raw tweets are normalised by ``clean_tweet``, vectorised with a
    ``TfidfVectorizer`` (English stop words removed) and classified with a
    ``PassiveAggressiveClassifier``.
    """

    # POS tags whose tokens are discarded by ``clean_tweet``.
    # NOTE(review): this drops nouns, verbs, adjectives and adverbs and keeps everything
    # else, which looks inverted for a content-word filter. Behaviour is kept as it was;
    # confirm the intent, and override this attribute to experiment.
    EXCLUDED_POS = ("NOUN", "VERB", "ADJ", "ADV")

    # Removes a lemma that starts with http:// or https:// (compiled once, not per token).
    _URL_PATTERN = re.compile(r'^https?:\/\/.*[\r\n]*', flags=re.MULTILINE)

    def __init__(self, random_state=42):
        """Create the untrained vectorizer and classifier.

        Args:
            random_state: Seed forwarded to the classifier so repeated runs train
                the same model. Pass ``None`` for unseeded behaviour.
        """
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.model = PassiveAggressiveClassifier(
            max_iter=500, tol=1e-3, random_state=random_state
        )

    def get_clean_text(self, df):
        """Return the ``text`` column of ``df`` with every tweet passed through ``clean_tweet``."""
        return df.text.apply(self.clean_tweet)

    def _clean_many(self, texts):
        """Clean every tweet in ``texts`` and return the results as a list of strings."""
        return [self.clean_tweet(text) for text in texts]

    def fit(self, x, y):
        """Fit the vectorizer and the classifier.

        Args:
            x: Iterable of raw tweet strings.
            y: Target labels aligned with ``x``.
        """
        x_vector = self.vectorizer.fit_transform(self._clean_many(x))
        self.model.fit(x_vector, y)
        print("Training Finished")

    def predict(self, ids, x):
        """Predict a target for every tweet.

        Args:
            ids: Identifiers aligned with ``x`` (any array-like, 1-D or column-shaped).
            x: Iterable of raw tweet strings.

        Returns:
            DataFrame with columns ``id`` and ``target``.

        Raises:
            ValueError: If ``ids`` and ``x`` differ in length.
        """
        x_vector = self.vectorizer.transform(self._clean_many(x))
        y_pred = np.asarray(self.model.predict(x_vector)).ravel()
        ids = np.asarray(ids).ravel()
        if ids.shape[0] != y_pred.shape[0]:
            raise ValueError(
                "ids and x must have the same length, got %d and %d"
                % (ids.shape[0], y_pred.shape[0])
            )
        # Build the frame column by column so ids and predictions keep their own
        # dtypes; concatenating them into one array would coerce both to one type.
        return pd.DataFrame({'id': ids, 'target': y_pred})

    def clean_tweet(self, t):
        """Normalise one tweet into a space-separated string of lemmas.

        A token is dropped when it is a spaCy stop word, when its text is a
        substring of ``string.punctuation``, or when its POS tag is listed in
        ``EXCLUDED_POS``. For the remaining tokens the lemma is used, URL lemmas
        are removed and ``#`` characters are stripped.
        """
        document = []
        for token in nlp(t):
            if (
                token.is_stop
                or token.text in string.punctuation
                or token.pos_ in self.EXCLUDED_POS
            ):
                continue
            text = self._URL_PATTERN.sub('', token.lemma_).replace('#', '').strip()
            # Removed URLs and whitespace tokens leave nothing behind; skipping them
            # avoids stray spaces in the joined result.
            if text:
                document.append(text)
        return " ".join(document)

    def validate(self, ids, x, y):
        """Predict on ``x`` and print accuracy, zero-one loss and a classification report.

        Args:
            ids: Identifiers aligned with ``x``.
            x: Iterable of raw tweet strings.
            y: True labels aligned with ``x``.

        Returns:
            Tuple ``(accuracy, loss, classification_report_text)``.
        """
        y_frame = self.predict(ids, x)
        y_pred = y_frame.target.values
        accuracy = accuracy_score(y, y_pred)
        print("Accuracy of model is: ", accuracy)
        print("-"*30)
        loss = zero_one_loss(y, y_pred)
        print("Loss of model is: ", loss)
        print("-"*30)
        cr = classification_report(y, y_pred)
        print("Classification Report of model is: ", cr)
        return accuracy, loss, cr
        
        
    
        

In [None]:
# Build an untrained DisasterTweetModel (TF-IDF vectorizer + Passive-Aggressive classifier).
model = DisasterTweetModel()

In [None]:
# Train on the raw tweet texts and their labels from the training split.
model.fit(x=train_df["text"].values, y=train_df["target"].values)

In [None]:
# Score the trained model on the validation split; keeps accuracy, loss and the report.
ac, los, cr = model.validate(
    ids=valid_df["id"].values,
    x=valid_df["text"].values,
    y=valid_df["target"].values,
)