## Model competition

In [42]:
import re
import re 
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
nltk.download('punkt')  # For tokenization
nltk.download('averaged_perceptron_tagger') 
nltk.download('wordnet')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
custom_stopwords = ['says', 'said','one','new' ,'news']
stop_words.update(custom_stopwords)

def clean_text(text):
    """ Clean text data by removing special characters and stopwords"""
    if text.startswith("b'") or text.startswith('b"'):
        text = text[2:-1]
    
    # Remove special characters
    text = bytes(text, 'utf-8').decode('unicode_escape', 'ignore')
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    # Keep only nouns and adjectives
    words = [word for word, tag in pos_tags if tag in ['NN', 'JJ']]
    #words = re.findall(r'\w+', text) 

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Remove stopwords
    words = [word for word in words if word not in stop_words]

    return ' '.join(words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Reby\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Reby\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Reby\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Reby\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [43]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# fill missing values and clean text
for i in range(1, 26):
    topic = f'Top{i}'
    train[topic] = train[topic].fillna('').apply(clean_text)
    test[topic] = test[topic].fillna('').apply(clean_text)

In [44]:
train.head(3)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,georgia russian war,musharraf,russia today column south ossetia footage youtube,russian capital south ossetia georgian artille...,afghan impunity official sick year old nothing,russian south ossetia whilst georgia russian,georgia ossetia russia side,enemy combatent nothing sham salim haman,...,georgia ossetia russia absorb georgia full sca...,alqaeda islamist backlash,condoleezza rice israeli strike iran israeli d...,busy day european union iran protest nuclear p...,georgia iraq russian georgia breakaway region ...,pentagon iran bad idea world report,caucasus crisis georgia ossetia,indian shoe manufactory series work,mental,help mexico kidnapping surge
1,2008-08-11,1,wont america nato iraq,bush georgian conflict,jewish georgian minister training russia,georgian army disarray gori shot,olympic ceremony,mossad fraudulent zealand iraq,russia israeli military sale,american citizen living sossetia georgian geno...,...,israel georgian aggression,tv russian georgian,montreal canada police boy saturday,china manufacturer,war south ossetia,israeli group state torture,russia united head peak oil,question georgia russia conflict,russia much war,come trading sex food
2,2008-08-12,0,adorable yearold opening,russia georgia operation,sexual harassment,alqaeda support iraq brutal crackdown unislamic,ceasefire georgia putin west,microsoft intel laptop,russogeorgian war balance power,im sense whole georgiarussia war vote,...,georgia georgia first place,russia response georgia right,gorbachev serious blunder interest caucasus re...,russia georgia nato cold war,adorable yearold country war evidence,war georgia israeli connection,point encouraging georgia south ossetia goddam...,christopher georgian invasion south ossetia ru...,mexico,bbc asiapacific extinction man


In [45]:
test.head(3) 

Unnamed: 0,ID,Date,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,0,2015-01-02,cancer result sheer bad luck unhealthy researc...,iran united islamic state ploy region reality ...,poll antimuslim,uk royal family prince andrew lawsuit underage...,bus destination rural northern sweden malm big...,pakistani boat india navy chase board vessel p...,sweden third mosque arson attack week,french year,...,ukrainian minister tv closure russian,palestinian president mahmoud abbas serious co...,israeli security center killed hamas,year year syria fouryear conflict,secret underground complex development wmd nuc...,web freedom major global issue,austrian journalist erich mchel presentation h...,ukraine kiev,china harvesting executed,plug russia last independent tv station
1,1,2015-01-05,high speed train trip time current,ancient egypt sunday symbolic burial site god ...,china n korean soldier world,scotland fossil fuelfree renewable energy ener...,prime minister shinzo abe monday remorse world...,sex centre prince andrew scandal teen,gay relative hamas founder deportation canada ...,number female drug iran,...,islamic state budget expected surplus islamic ...,iceland eu application lift capital,blackfield capital founder value ruble thing r...,rocket stage earth rural chinese village,dead aircraft bomb greek tanker libyan port,belgian murderer van den bleeken request belgi...,czech president ukrainian pm yatsenyuk prime m...,vietnamese search bahamian cargo ship sinking,france end ukraine,china rare
2,2,2015-01-06,oil barrel,toyota fuel cell car future,young indian couple police protection death,senior figure islamic force syria eastern prov...,fukushima rice radiation st time disaster,spanish guilty financial audit court,abdullah saudi throne,taliban commander linkedin,...,india pakistan spread km mile stretch border d...,turkey erdogan corruption authority,spacex falcon launch recovery next launch wind...,cnn gambia coup,islamic state official,libya country entry,judicial inquiry france monday country notorio...,video moment cameraman factory small town colo...,syria united republican senator john mccain fo...,india set iris telescope


In [46]:
train.Date.shape

(1611,)

add date features 

In [47]:
def add_date_feature(df):
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day

In [48]:
# add date feature to dataframe 'date'
add_date_feature(train)
add_date_feature(test)
 


In [49]:
# use tf-idf to vectorize for combined top 25 topics
from sklearn.feature_extraction.text import TfidfVectorizer

# combine all top 25 topics into one column
df_train = train.copy()
df_test = test.copy()

for i in range(1, 26):
    topic = f'Top{i}'
    df_train[topic] = df_train[topic].fillna('').apply(clean_text)
    df_test[topic] = df_test[topic].fillna('').apply(clean_text)

df_train['combined_text'] = df_train.iloc[:, 2:27].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
df_test['combined_text'] = df_test.iloc[:, 2:27].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

df_train.head(3)



Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top20,Top21,Top22,Top23,Top24,Top25,Year,Month,Day,combined_text
0,2008-08-08,0,georgia russian war,musharraf,russia today south ossetia footage youtube,russian capital south ossetia georgian artille...,afghan impunity official sick year old nothing,russian south ossetia whilst georgia russian,georgia ossetia russia side,enemy combatent nothing sham salim haman,...,georgia iraq russian georgia breakaway region ...,pentagon iran bad idea world report,caucasus crisis georgia ossetia,indian shoe manufactory series work,mental,help kidnapping surge,2008,8,8,georgia russian war musharraf russia today sou...
1,2008-08-11,1,wont america nato iraq,bush georgian conflict,jewish georgian minister training russia,georgian army disarray gori shot,olympic ceremony,mossad fraudulent zealand iraq,russia israeli military sale,american citizen sossetia georgian genocide in...,...,war south ossetia,israeli group state torture,russia united head peak oil,question georgia russia conflict,much war,come trading sex food,2008,8,11,wont america nato iraq bush georgian conflict ...
2,2008-08-12,0,adorable yearold opening,russia georgia operation,sexual harassment,alqaeda support iraq brutal crackdown unislamic,ceasefire georgia putin west,microsoft intel laptop,russogeorgian war balance power,im sense whole georgiarussia war vote,...,adorable yearold country war evidence,war georgia israeli connection,point georgia south ossetia goddamnit bush,christopher georgian invasion south ossetia ru...,mexico,bbc asiapacific extinction man,2008,8,12,adorable yearold opening russia georgia operat...


In [50]:
df_train.shape

(1611, 31)

In [68]:
# create tf-idf vectorizer
tfidf = TfidfVectorizer(max_features=5000, stop_words=list(stop_words), ngram_range=(1, 2))
tfidf_features_train = tfidf.fit_transform(df_train['combined_text'])
tfidf_features_test = tfidf.transform(df_test['combined_text'])

tfidf_df_train = pd.DataFrame(tfidf_features_train.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_features_train.shape[1])])
tfidf_df_test = pd.DataFrame(tfidf_features_test.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_features_test.shape[1])])
tfidf_features_test = tfidf.transform(df_test['combined_text'])

# features 

df_train['Day_of_Week'] = df_train['Date'].dt.dayofweek
df_train['Is_Weekend'] = df_train['Day_of_Week'].apply(lambda x: 1 if x > 4 else 0)
df_train['Month_Start'] = df_train['Date'].dt.is_month_start.astype(int)
df_train['Month_End'] = df_train['Date'].dt.is_month_end.astype(int)
df_train['Quarter'] = df_train['Date'].dt.quarter

X_train = pd.concat([tfidf_df_train, df_train[['Year', 'Month', 'Day']]], axis=1)
X_test = pd.concat([tfidf_df_test, df_test[['Year', 'Month', 'Day']]], axis=1)



In [69]:
X_train.shape, df_train['Label'].shape

((1611, 5003), (1611,))

In [70]:
# split train
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_val, y_train, y_val = train_test_split(X_train, df_train['Label'], test_size=0.2, random_state=42)


In [77]:
# Random Forest
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Parameters for SVM
param_grid_svm = {
    'C': [0.1, 1, 10],
    'gamma': [1, 0.1, 0.01],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

# Parameters for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4, 6, 8],
    'criterion' :['gini', 'entropy']
}

# Parameters for Logistic Regression
param_grid_lr = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear']
}

# For SVM
grid_svm = GridSearchCV(SVC(), param_grid_svm, refit=True, verbose=3, cv=5)

# For Random Forest
grid_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, refit=True, verbose=3, cv=5)

# For Logistic Regression
grid_lr = GridSearchCV(LogisticRegression(), param_grid_lr, refit=True, verbose=3, cv=5)

In [78]:
grid_svm.fit(X_train, y_train)
grid_rf.fit(X_train, y_train)
grid_lr.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.605 total time=   5.6s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.601 total time=   5.3s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.601 total time=   5.9s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.603 total time=   4.9s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.603 total time=   5.1s


In [None]:
import numpy as np

# SVM Evaluation
y_val_pred_svm = grid_svm.predict(X_val)
accuracy_svm = accuracy_score(y_val, y_val_pred_svm)
print(f"SVM Accuracy: {accuracy_svm}")
print(classification_report(y_val, y_val_pred_svm))
print(confusion_matrix(y_val, y_val_pred_svm))
preds_test = grid_svm.predict(X_test)
pd.DataFrame({'ID': np.arange(len(preds_test)), 'Label': preds_test}).to_csv('submission_svm.csv', index=False)

# Random Forest Evaluation
y_val_pred_rf = grid_rf.predict(X_val)
accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf}")
print(classification_report(y_val, y_val_pred_rf))
print(confusion_matrix(y_val, y_val_pred_rf))
preds_test = grid_rf.predict(X_test)
pd.DataFrame({'ID': np.arange(len(preds_test)), 'Label': preds_test}).to_csv('submission_rf.csv', index=False)


# Logistic Regression Evaluation
y_val_pred_lr = grid_lr.predict(X_val)
accuracy_lr = accuracy_score(y_val, y_val_pred_lr)
print(f"Logistic Regression Accuracy: {accuracy_lr}")
print(classification_report(y_val, y_val_pred_lr))
print(confusion_matrix(y_val, y_val_pred_lr))
preds_test = grid_lr.predict(X_test)
pd.DataFrame({'ID': np.arange(len(preds_test)), 'Label': preds_test}).to_csv('submission_lr.csv', index=False)



In [None]:
import numpy as np
# creating a submission file for kaggle
pd.DataFrame({'ID': np.arange(len(preds_test)), 'Label': preds_test}).to_csv('submission_2gram.csv', index=False)

In [71]:
# Logistical Regression 
"""
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_val_pred = model.predict(X_val) 
preds_test = model.predict(X_test)

print(classification_report(y_val, y_val_pred))
print(confusion_matrix(y_val, y_val_pred))
preds_test = model.predict(X_test)
"""

              precision    recall  f1-score   support

           0       0.43      0.08      0.13       118
           1       0.64      0.94      0.76       205

    accuracy                           0.63       323
   macro avg       0.53      0.51      0.45       323
weighted avg       0.56      0.63      0.53       323

[[  9 109]
 [ 12 193]]


In [76]:
print((model.predict(X_val) == y_val).mean() * 100.0)

62.538699690402474


In [75]:
import numpy as np
# creating a submission file for kaggle
pd.DataFrame({'ID': np.arange(len(preds_test)), 'Label': preds_test}).to_csv('submission_2gram.csv', index=False)