# Importing the necessary libraries

In [1]:
# Load the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Importing the dataset

In [2]:
# Load the train and test files
train = pd.read_csv(r"C:\Users\KIIT\Desktop\T&T lab\train.csv", sep='\t', encoding='utf-8')
test = pd.read_csv(r"C:\Users\KIIT\Desktop\T&T lab\test.csv", sep='\t', encoding='utf-8')

In [3]:
train.head()

Unnamed: 0,text,label
0,Get the latest from TODAY Sign up for our news...,1
1,2d Conan On The Funeral Trump Will Be Invited...,1
2,It’s safe to say that Instagram Stories has fa...,0
3,Much like a certain Amazon goddess with a lass...,0
4,At a time when the perfect outfit is just one ...,0


In [4]:
import nltk #Import NLTK ---> Natural Language Toolkit
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [5]:
train['text'].loc[0]

'Get the latest from TODAY Sign up for our newsletter  No one ever truly gets over losing a loved one, and Blake Shelton is no exception. He was just 14 when his older brother Richie died on Nov. 13, 1990. And, as Shelton noted in a tweet Monday, "It changed my life forever."  Richie was 24 when he died in a car accident in the Sheltons\' home state of Oklahoma. Two years ago, Shelton sent out a message for the 25th anniversary of his loss:  Richie, who was Blake\'s half-brother (they shared a mother), was a passenger in a car that collided with a school bus in Ada, south of Oklahoma City.  Richie, driver Redena McManus and a 3-year-old boy, Christopher McManus, all died during or shortly after the collision, while the bus driver and passengers were uninjured, according to police reports.  The accident has clearly remained with Blake, who told 60 Minutes in 2014, "I remember picking up the phone to call him a week after he was dead, to tell him something. I was picking up the phone to 

In [6]:
train.text.loc[ : 5].values.tolist()[0]

'Get the latest from TODAY Sign up for our newsletter  No one ever truly gets over losing a loved one, and Blake Shelton is no exception. He was just 14 when his older brother Richie died on Nov. 13, 1990. And, as Shelton noted in a tweet Monday, "It changed my life forever."  Richie was 24 when he died in a car accident in the Sheltons\' home state of Oklahoma. Two years ago, Shelton sent out a message for the 25th anniversary of his loss:  Richie, who was Blake\'s half-brother (they shared a mother), was a passenger in a car that collided with a school bus in Ada, south of Oklahoma City.  Richie, driver Redena McManus and a 3-year-old boy, Christopher McManus, all died during or shortly after the collision, while the bus driver and passengers were uninjured, according to police reports.  The accident has clearly remained with Blake, who told 60 Minutes in 2014, "I remember picking up the phone to call him a week after he was dead, to tell him something. I was picking up the phone to 

# Data Preprocessing

In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# create a function to tokenize the data
def preprocess_data(data):
  
  # 1. Tokenization
  tk = RegexpTokenizer('\s+', gaps = True)
  text_data = [] # List for storing the tokenized data
  for values in data.text:
    tokenized_data = tk.tokenize(values) # Tokenize the news
    text_data.append(tokenized_data) # append the tokenized data

  # 2. Stopword Removal

  # Extract the stopwords
  sw = stopwords.words('english')
  clean_data = [] # List for storing the clean text
  # Remove the stopwords using stopwords
  for data in text_data:
    clean_text = [words.lower() for words in data if words.lower() not in sw]
    clean_data.append(clean_text) # Appned the clean_text in the clean_data list
      # 4. tfidf vectorizer --> Term Frequency Inverse Document Frequency
        
  # 3. Stemming

  # Create a stemmer object
  ps = PorterStemmer()
  stemmed_data = [] # List for storing the stemmed data
  for data in clean_data:
    stemmed_text = [ps.stem(words) for words in data] # Stem the words
    stemmed_data.append(stemmed_text) # Append the stemmed text
  

  '''TF-IDF stands for Term Frequency Inverse Document Frequency of records. 
     It can be defined as the calculation of how relevant a word in a series or corpus is to a text. 
     The meaning increases proportionally to the number of times in the text a word appears but is compensated 
     by the word frequency in the corpus (data-set).'''

  '''Term Frequency: In document d, the frequency represents the number of instances of a given word t. 
     Therefore, we can see that it becomes more relevant when a word appears in the text, which is rational. 
     Since the ordering of terms is not significant, we can use a vector to describe the text in the bag of term models. 
     For each specific term in the paper, there is an entry with the value being the term frequency.'''

     # tf(t,d) = count of t in d / number of words in d
  '''Document Frequency: This tests the meaning of the text, which is very similar to TF, in the whole corpus collection. 
     The only difference is that in document d, TF is the frequency counter for a term t, while df is the number of occurrences 
     in the document set N of the term t. In other words, the number of papers in which the word is present is DF.'''

     # df(t) = occurrence of t in documents

  '''Inverse Document Frequency: Mainly, it tests how relevant the word is. 
     The key aim of the search is to locate the appropriate records that fit the demand. 
     Since tf considers all terms equally significant, it is therefore not only possible to use the term frequencies 
     to measure the weight of the term in the paper. First, find the document frequency of a term t by counting the 
     number of documents containing the term.'''

  ''' df(t) = N(t)
      where
      df(t) = Document frequency of a term t
      N(t) = Number of documents containing the term t'''
     # Take the log, idf(t) = log(N/ df(t))
      # tf-idf(t, d) = tf(t, d) * idf(t)
  
  # Flatten the stemmed data

  updated_data = []
  for data in stemmed_data:
    updated_data.append(" ".join(data))

  # TFID Vector object
  tfidf = TfidfVectorizer()
  tfidf_matrix = tfidf.fit_transform(updated_data)

  return tfidf_matrix

In [14]:
# Call the above function on the merged data
train_len = train.shape[0]
merged_data = pd.concat((train.drop('label', axis=1), test.drop('id', axis=1)), axis=0).reset_index().drop('index', axis=1)

In [15]:
# preprocess the merged data
preprocessed_data = preprocess_data(merged_data)

In [16]:
train_data = preprocessed_data[ : train_len]
test_data = preprocessed_data[train_len : ]

# Train-Test Split

In [17]:
# Model selection
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(train_data, train.label, test_size=0.2, random_state = 42)

In [18]:
# Metrics
from sklearn.metrics import accuracy_score

# Modelling

In [19]:
# model
def compute_metrics(data, y_true, model_obj, model):

  # Make predictions
  y_pred = model_obj.predict(data)

  # Compute accuracy
  acc = accuracy_score(y_true = y_true, y_pred = y_pred)

  # Make DataFrame
  metrics = pd.DataFrame(data = np.array([acc]), index=[model], columns=['Accuracy Score'])
  return metrics

# Logistic Regression

In [20]:
# 1. LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

In [21]:
# Model object
lr_reg = LogisticRegressionCV(Cs=20, cv=3, random_state=42)

In [22]:
# fit the model
lr_reg.fit(X_train, y_train)

LogisticRegressionCV(Cs=20, cv=3, random_state=42)

In [23]:
# Compute the Logistic Regression Metrics
lr_metrics =  compute_metrics(X_test, y_test, lr_reg, 'LogisticRegression')

In [24]:
lr_metrics_train =  compute_metrics(X_train, y_train, lr_reg, 'LogisticRegression')

In [25]:
lr_metrics

Unnamed: 0,Accuracy Score
LogisticRegression,0.778557


# Multinomial Naive-Bayes

In [26]:
#2. Naive Bayes
from sklearn.naive_bayes import MultinomialNB


In [27]:
# Model Object
mnb = MultinomialNB(alpha=0.0)

In [28]:
# Fit the object
mnb.fit(X_train, y_train)

MultinomialNB(alpha=0.0)

In [29]:
# Compute metrics
mnb_metrics = compute_metrics(X_test, y_test, mnb, 'Naive Bayes')

In [30]:
mnb_metrics

Unnamed: 0,Accuracy Score
Naive Bayes,0.719439


# Descision Tree Classifier

In [31]:
# 3. DecisionTree
from sklearn.tree import DecisionTreeClassifier

In [32]:
# Model Object
dt_clf = DecisionTreeClassifier()

In [33]:
# Fit the object
dt_clf.fit(X_train, y_train)

DecisionTreeClassifier()

In [34]:
dt_metrics = compute_metrics(X_test, y_test, dt_clf, "DecisionTree")

In [35]:
dt_metrics

Unnamed: 0,Accuracy Score
DecisionTree,0.687375


# XG Boost Classifier

In [36]:
from xgboost import XGBClassifier

In [37]:
# XGB model
xgb_model = XGBClassifier(n_estimators=200)

In [38]:
xgb_model.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [39]:
xgb_metrics = compute_metrics(X_test, y_test, xgb_model, 'XGBClassifier')

In [40]:
# Concatenate all the metrics
model_metrics = pd.concat((lr_metrics, mnb_metrics, dt_metrics, xgb_metrics), axis=0).sort_values(by='Accuracy Score', ascending=False)

# Comparing the accuracies of all models

In [41]:
model_metrics

Unnamed: 0,Accuracy Score
LogisticRegression,0.778557
XGBClassifier,0.777555
Naive Bayes,0.719439
DecisionTree,0.687375


# Final prediction model using XG Boost

In [42]:
# Make predictions --> XGBoost
predictions = xgb_model.predict(test_data)

# Predicted dataset

In [43]:
predictions

array(['0', '0', '1', ..., '1', '0', '0'], dtype=object)