In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"
from sklearn.model_selection import train_test_split
try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import pandas as pd
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
import nltk
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from nltk.corpus.reader.wordnet import *
import re
import string
from sklearn.metrics import classification_report

In [2]:
stop_words = set(stopwords.words('english'))
wn = nltk.WordNetLemmatizer()
worddict = set(nltk.corpus.words.words())
english_punctuations = string.punctuation
punctuations_list = english_punctuations
# def clean_text(text):
#     return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split())
def merge_key_word_text(x,y):
    tem=""
    if str(x)!='nan':
        tem+=(str(x)+" ")
    return tem+str(y)
def clean_text(text):
    text= re.sub("'", "", text)
    text = re.sub('@[A-Za-z0-9_]+', '', text) #removes @mentions
    text = re.sub('#','',text) #removes hastag '#' symbol
    text = re.sub('RT[\s]+','',text)
    text = re.sub('https?:\/\/\S+', '', text) 
    text = re.sub('\n',' ',text)
    tem="".join([char for char in text if char not in string.punctuation])
    clean_str = ''.join([c for c in tem if ord(c) < 128])
    return clean_str
def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)
def cleaning_repeating_char(text):
    return re.sub(r'(.)\1\1+', r'\1\1', text)#(.)\1{3,}
def preprocessing(text):
    new_text = clean_text(text.strip())
    new_text = cleaning_numbers(new_text)
    new_text = cleaning_repeating_char(new_text)
    tokens = word_tokenize(new_text.lower())
    new_text = " ".join([wn.lemmatize(w) for w in tokens if not w in stop_words])
    #text=" ".join([wn.lemmatize(word) for word in word_tokenize(new_text)])
    return new_text
#     wordset_n = set(wn.lemmatize(w, NOUN) for w in word_tokenize(new_text.lower().strip()))
#     wordset_v = set(wn.lemmatize(w, VERB) for w in wordset_n)
#     wordset = set(wn.lemmatize(w, ADJ) for w in wordset_v)
#     wordset = wordset & worddict
#     return ' '.join(list(wordset))

In [3]:
data_train = pd.read_csv("train.csv")
data_test = pd.read_csv("test.csv")

In [4]:
data_train['m_text']=data_train.apply(lambda row: merge_key_word_text(row['keyword'],row['text']),axis=1)
data_test['m_text']=data_test.apply(lambda row: merge_key_word_text(row['keyword'],row['text']),axis=1)
data_train['new_text']=data_train['m_text'].apply(lambda x: preprocessing(x))
data_test['new_text']=data_test['m_text'].apply(lambda x: preprocessing(x))
data_train.to_csv('tem.csv')

In [5]:
features='new_text'#['new_text','keyword_target','location_clean_target']
x_train, x_test, y_train, y_test = train_test_split(
    data_train[features],
    data_train['target'],stratify=data_train['target'],
    test_size=0.2,
    random_state=1)

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer='word',max_df=0.95)
x_train_tf = tfidf_vect.fit_transform(x_train)
x_pre_tf = tfidf_vect.transform(x_test)
x_test_tf = tfidf_vect.transform(data_test['new_text'])

In [7]:
# m_NB = MultinomialNB(alpha=1, fit_prior=True, class_prior=None)
# m_NB
BNBmodel = BernoulliNB(alpha=1.1, fit_prior=True, class_prior=None)
BNBmodel.fit(x_train_tf, y_train) # train the classifier

# convert list to matrix
# tem=x_test[['keyword_target','location_clean_target']]
# x_pre_f=pd.DataFrame(x_pre_tf.toarray(), columns=tfidf_vect.get_feature_names_out(),index=tem.index)
# x_pre_f=pd.concat([x_pre_f,tem],axis=1)
predicted = BNBmodel.predict(x_pre_tf)
cr5    = classification_report(y_test,predicted)
print(cr5)
metrics.accuracy_score(list(y_test), predicted)

              precision    recall  f1-score   support

           0       0.78      0.92      0.84       869
           1       0.86      0.66      0.74       654

    accuracy                           0.80      1523
   macro avg       0.82      0.79      0.79      1523
weighted avg       0.81      0.80      0.80      1523



0.8049901510177282

In [8]:
cm = confusion_matrix(list(y_test), predicted)
cm

array([[797,  72],
       [225, 429]], dtype=int64)

In [9]:
predicted =BNBmodel.predict(x_test_tf)
pd.DataFrame({'id':data_test['id'],'target':predicted}).to_csv('submission_b_nb.csv',index=False)

In [10]:
from sklearn.linear_model import LogisticRegression
model_l =LogisticRegression() #LogisticRegression(C=0.7,solver='lbfgs',fit_intercept=False)#model_l = LogisticRegression(C=1.2)
model_l.fit(x_train_tf, y_train)
predicted = model_l.predict(x_pre_tf)
cr5    = classification_report(y_test,predicted)
print(cr5)
metrics.accuracy_score(list(y_test), predicted)

              precision    recall  f1-score   support

           0       0.79      0.88      0.83       869
           1       0.81      0.69      0.74       654

    accuracy                           0.80      1523
   macro avg       0.80      0.78      0.79      1523
weighted avg       0.80      0.80      0.79      1523



0.7964543663821405

In [11]:
from sklearn.model_selection import GridSearchCV
hyperparameters = {
    'solver':['lbfgs', 'liblinear'],
    'penalty': ['l1', 'l2'],
    'C': [1,1.05,1.1,1.15,1.2,1.25,1.3,1.35,1.4,1.45,1.5],
    'class_weight':['balanced',None]
}
model = LogisticRegression()
grid_search = GridSearchCV(model, hyperparameters, cv=5)
grid_search.fit(x_train_tf, y_train)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_pre_tf)
cr5    = classification_report(y_test,predicted)
print(cr5)
metrics.accuracy_score(list(y_test), predicted)

110 fits failed out of a total of 440.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
110 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Sajin.LAPTOP-RE0DL8PH\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Sajin.LAPTOP-RE0DL8PH\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Sajin.LAPTOP-RE0DL8PH\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalt

              precision    recall  f1-score   support

           0       0.79      0.88      0.83       869
           1       0.81      0.69      0.74       654

    accuracy                           0.80      1523
   macro avg       0.80      0.78      0.79      1523
weighted avg       0.80      0.80      0.79      1523



0.7964543663821405

In [12]:
cm = confusion_matrix(list(y_test), predicted)
cm

array([[765, 104],
       [206, 448]], dtype=int64)

In [13]:
predicted = best_model.predict(x_test_tf)
pd.DataFrame({'id':data_test['id'],'target':predicted}).to_csv('submission_cv_log.csv',index=False)

In [14]:
from sklearn.linear_model import LogisticRegression
model_l =LogisticRegression(C=1.15,solver='liblinear',class_weight='balanced') #LogisticRegression(C=0.7,solver='lbfgs',fit_intercept=False)#model_l = LogisticRegression(C=1.2)
model_l.fit(x_train_tf, y_train)
predicted = model_l.predict(x_pre_tf)
cr5    = classification_report(y_test,predicted)
print(cr5)
metrics.accuracy_score(list(y_test), predicted)

              precision    recall  f1-score   support

           0       0.82      0.82      0.82       869
           1       0.76      0.75      0.75       654

    accuracy                           0.79      1523
   macro avg       0.79      0.79      0.79      1523
weighted avg       0.79      0.79      0.79      1523



0.7898883782009193

In [15]:
cm = confusion_matrix(list(y_test), predicted)
cm

array([[710, 159],
       [161, 493]], dtype=int64)

In [16]:
predicted = model_l.predict(x_test_tf)
pd.DataFrame({'id':data_test['id'],'target':predicted}).to_csv('submission_l_log.csv',index=False)

In [17]:
tem=pd.read_csv('submission_b_nb.csv')
tem_2=pd.read_csv('model_submission.csv')
c5=classification_report(tem_2[['target']],tem[['target']])
print(c5)
sklearn.metrics.f1_score(tem_2[['target']],tem[['target']])

              precision    recall  f1-score   support

           0       0.88      0.93      0.90      2084
           1       0.86      0.77      0.81      1179

    accuracy                           0.87      3263
   macro avg       0.87      0.85      0.86      3263
weighted avg       0.87      0.87      0.87      3263



0.8119429590017826

In [18]:
tem_3=pd.read_csv('submission_cv_log.csv')
c5=classification_report(tem_2[['target']],tem_3[['target']])
print(c5)
sklearn.metrics.f1_score(tem_2[['target']],tem_3[['target']])

              precision    recall  f1-score   support

           0       0.88      0.88      0.88      2084
           1       0.79      0.79      0.79      1179

    accuracy                           0.85      3263
   macro avg       0.83      0.84      0.84      3263
weighted avg       0.85      0.85      0.85      3263



0.7900295732995354

In [19]:
tem_4=pd.read_csv('submission_l_log.csv')
c5=classification_report(tem_2[['target']],tem_4[['target']])
print(c5)
sklearn.metrics.f1_score(tem_2[['target']],tem_4[['target']])

              precision    recall  f1-score   support

           0       0.90      0.82      0.86      2084
           1       0.72      0.84      0.78      1179

    accuracy                           0.83      3263
   macro avg       0.81      0.83      0.82      3263
weighted avg       0.84      0.83      0.83      3263



0.7776475185619383