In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the labelled training set and the unlabelled test set of the competition.
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Class balance of the training labels. The feature-extraction step of this
# notebook counts (word, 1) pairs as "related to disaster", so target == 1 is
# the disaster class. Name the classes explicitly instead of the ambiguous
# "positive"/"negative", which previously had target == 0 labelled "positive".
n_non_disaster = int((train["target"] == 0).sum())
n_disaster = int((train["target"] == 1).sum())
print("Number of non-disaster tweets (target = 0): ", n_non_disaster)
print("Number of disaster tweets (target = 1): ", n_disaster)

In [None]:
import nltk
import string
import re
# The tokenizer, stemmer and stopword set are the same for every tweet, so they
# are created once on first use and then shared by all calls to preprocess().
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return the shared (tokenizer, stemmer, stopwords) triple, building it lazily."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["tokenizer"] = nltk.tokenize.TweetTokenizer(
            preserve_case=False, reduce_len=True, strip_handles=True
        )
        _PREPROCESS_RESOURCES["stemmer"] = nltk.stem.PorterStemmer()
        # A frozenset gives constant-time membership tests; the corpus call
        # returns a list, which would be scanned linearly for every token.
        _PREPROCESS_RESOURCES["stopwords"] = frozenset(
            nltk.corpus.stopwords.words('english')
        )
    return (
        _PREPROCESS_RESOURCES["tokenizer"],
        _PREPROCESS_RESOURCES["stemmer"],
        _PREPROCESS_RESOURCES["stopwords"],
    )


def preprocess(tweet):
    """Clean, tokenize, filter and stem a single tweet.

    Steps, in order:
      1. Drop a leading "RT" retweet marker and the whitespace after it.
      2. Remove hyperlinks. Only the URL itself (up to the next whitespace)
         is removed; any text following the URL is kept.
      3. Remove '#' characters. The hashtag word itself is kept.
      4. Tokenize with nltk's TweetTokenizer, configured with
         preserve_case=False, reduce_len=True and strip_handles=True.
      5. Discard English stopwords and punctuation tokens.
      6. Porter-stem every remaining token.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order (duplicates are kept).
    """
    tweet = re.sub(r'^RT[\s]+', '', tweet)  # remove retweet marker
    # Match the URL only. The previous pattern used a greedy `.*`, which also
    # deleted every word between the URL and the end of the line.
    tweet = re.sub(r'https?://\S+', '', tweet)
    tweet = re.sub(r'#', '', tweet)  # remove the hash sign, keep the tag word

    tokenizer, stemmer, english_stopwords = _preprocess_resources()

    return [
        stemmer.stem(word)
        for word in tokenizer.tokenize(tweet)
        # `in string.punctuation` is a substring test against the punctuation
        # string: single punctuation characters are dropped, while multi-character
        # tokens that are not a contiguous run of it (e.g. ":)") are kept.
        if word not in english_stopwords and word not in string.punctuation
    ]

In [None]:
# Sanity check: show one raw tweet next to its preprocessed token list.
sample_tweet = train["text"][5]
print("Original Tweet: ", sample_tweet)
print("Processed Tweet: ", preprocess(sample_tweet))

In [None]:
#Building frequency dictionary of every unique word in the dataset
def freq_builder(tweets,labels):
    """Count how often each preprocessed word occurs under each label.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts; each one is passed through ``preprocess``.
    labels : array-like
        One label per tweet. Any shape that flattens to one value per tweet
        is accepted (e.g. ``(m,)``, ``(m, 1)``, or a pandas Series).

    Returns
    -------
    dict
        Maps ``(word, label)`` to the number of times ``word`` appears in
        tweets carrying ``label``. Repeated words within a tweet are counted
        each time they occur.
    """
    # Flatten instead of squeezing: np.squeeze collapses a single label to a
    # 0-d array whose .tolist() is a bare scalar, which zip() cannot iterate.
    labels_list = np.asarray(labels).ravel().tolist()

    freq_dict = {}
    for tweet, label in zip(tweets, labels_list):
        for word in preprocess(tweet):
            pair = (word, label)
            freq_dict[pair] = freq_dict.get(pair, 0) + 1
    return freq_dict

In [None]:
# Build the (word, label) -> count table from the training tweets and show a
# small slice of its entries as a spot check.
frequency = freq_builder(tweets=train["text"], labels=train["target"])
frequency_items = list(frequency.items())
print(frequency_items[5:10])

In [None]:
#Turning one tweet into a 1x3 feature row using the frequency dictionary.
def feature_extraction(tweet,freq_dict):
    """Build the feature row for a single tweet.

    The tweet is run through ``preprocess`` and every resulting token is
    looked up in ``freq_dict``; pairs missing from the dictionary count as 0.
    A token that occurs several times in the tweet is added each time.

    Returns a float array of shape (1, 3):
      column 0 - constant 1 (bias term)
      column 1 - sum of the counts stored under (word, 1.0), i.e. how often
                 the tweet's words were seen with label 1 (disaster)
      column 2 - sum of the counts stored under (word, 0.0), i.e. how often
                 the tweet's words were seen with label 0 (not disaster)
    """
    words = preprocess(tweet)
    label_one_total = sum(freq_dict.get((w, 1.0), 0) for w in words)
    label_zero_total = sum(freq_dict.get((w, 0.0), 0) for w in words)
    return np.array([[1, label_one_total, label_zero_total]], dtype=float)

In [None]:
# Build the training design matrix: one 3-column feature row per training tweet.
train_texts = train["text"]
n_train = len(train["target"])
X_train = np.zeros((n_train, 3))
for i in range(n_train):
    X_train[i, :] = feature_extraction(train_texts[i], frequency)
Y_train = train["target"]

# Spot-check a few feature rows against their labels.
print(X_train[10:15])
print(Y_train[10:15])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

# Evaluate an RBF-kernel SVM on the feature matrix with 3-fold cross-validation,
# reporting accuracy and F1 on both the training folds and the held-out folds.
# A logistic-regression alternative, kept for comparison:
#lr = LogisticRegression(penalty='l2',C = 1.0, solver = 'lbfgs', max_iter=100)
#result = cross_validate(lr, X_train, Y_train, scoring = ('accuracy','f1'), cv = 3, return_train_score=True)
svm = SVC(kernel='rbf', C=1.0)
result = cross_validate(
    svm,
    X_train,
    Y_train,
    scoring=('accuracy', 'f1'),
    cv=3,
    return_train_score=True,
)
for caption, key in (
    ("Train Accuracy Score: ", "train_accuracy"),
    ("Test Accuracy Score: ", "test_accuracy"),
    ("Train F1 Score: ", "train_f1"),
    ("Test F1 Score: ", "test_f1"),
):
    print(caption, result[key])

In [None]:
#Generating the Submission file
# Build the test feature matrix with the same extractor and frequency table
# that were used for the training features.
X_test = np.zeros((len(test["text"]),3))
for i in range(len(test["text"])):
    X_test[i,:] = feature_extraction(test["text"][i],frequency)

# Fit the final model on the full training set and predict the test labels.
#lr = LogisticRegression(penalty='l2',C = 1.0, solver = 'lbfgs', max_iter=100).fit(X_train,Y_train)
model = SVC(C=1.0,kernel='rbf').fit(X_train,Y_train)
Y_test = model.predict(X_test)

# Write the id/target pairs to submission.csv in the working directory.
# DataFrame.to_csv returns None when it is given a path, so its result is not
# bound to a name (the old `submission` variable was always None).
submission_df = pd.DataFrame({'id' : test["id"],'target' : Y_test})
submission_df.to_csv('submission.csv',index=False)