## Model competition

In [155]:
import re
import re 
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
nltk.download('punkt')  # For tokenization
nltk.download('averaged_perceptron_tagger') 
nltk.download('wordnet')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
custom_stopwords = ['says', 'said','one','new' ,'news']
stop_words.update(custom_stopwords)

def clean_text(text):
    """ Clean text data by removing special characters and stopwords"""
    if text.startswith("b'") or text.startswith('b"'):
        text = text[2:-1]
    
    # Remove special characters
    text = bytes(text, 'utf-8').decode('unicode_escape', 'ignore')
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    # Keep only nouns and adjectives
    words = [word for word, tag in pos_tags if tag in ['NN', 'JJ']]
    #words = re.findall(r'\w+', text) 

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Remove stopwords
    words = [word for word in words if word not in stop_words]

    return ' '.join(words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ahnki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ahnki\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ahnki\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahnki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [156]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# fill missing values and clean text
for i in range(1, 26):
    topic = f'Top{i}'
    train[topic] = train[topic].fillna('').apply(clean_text)
    test[topic] = test[topic].fillna('').apply(clean_text)

In [41]:
train.head(3)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,georgia russian war,musharraf,russia today column south ossetia footage youtube,russian capital south ossetia georgian artille...,afghan impunity official sick year old nothing,russian south ossetia whilst georgia russian,georgia ossetia russia side,enemy combatent nothing sham salim haman,...,georgia ossetia russia absorb georgia full sca...,alqaeda islamist backlash,condoleezza rice israeli strike iran israeli d...,busy day european union iran protest nuclear p...,georgia iraq russian georgia breakaway region ...,pentagon iran bad idea world report,caucasus crisis georgia ossetia,indian shoe manufactory series work,mental,help mexico kidnapping surge
1,2008-08-11,1,wont america nato iraq,bush georgian conflict,jewish georgian minister training russia,georgian army disarray gori shot,olympic ceremony,mossad fraudulent zealand iraq,russia israeli military sale,american citizen living sossetia georgian geno...,...,israel georgian aggression,tv russian georgian,montreal canada police boy saturday,china manufacturer,war south ossetia,israeli group state torture,russia united head peak oil,question georgia russia conflict,russia much war,come trading sex food
2,2008-08-12,0,adorable yearold opening,russia georgia operation,sexual harassment,alqaeda support iraq brutal crackdown unislamic,ceasefire georgia putin west,microsoft intel laptop,russogeorgian war balance power,im sense whole georgiarussia war vote,...,georgia georgia first place,russia response georgia right,gorbachev serious blunder interest caucasus re...,russia georgia nato cold war,adorable yearold country war evidence,war georgia israeli connection,point encouraging georgia south ossetia goddam...,christopher georgian invasion south ossetia ru...,mexico,bbc asiapacific extinction man


In [121]:
test.head(3) 

Unnamed: 0,ID,Date,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,0,2015-01-02,cancer result sheer bad luck unhealthy researc...,iran united islamic state ploy region reality ...,poll antimuslim,uk royal family prince andrew lawsuit underage...,bus destination rural northern sweden malm big...,pakistani boat india navy chase board vessel p...,sweden third mosque arson attack week,french year,...,ukrainian minister tv closure russian,palestinian president mahmoud abbas serious co...,israeli security center killed hamas,year year syria fouryear conflict,secret underground complex development wmd nuc...,web freedom major global issue,austrian journalist erich mchel presentation h...,ukraine kiev,china harvesting executed,plug russia last independent tv station
1,1,2015-01-05,high speed train trip time current,ancient egypt sunday symbolic burial site god ...,china n korean soldier world,scotland fossil fuelfree renewable energy ener...,prime minister shinzo abe monday remorse world...,sex centre prince andrew scandal teen,gay relative hamas founder deportation canada ...,number female drug iran,...,islamic state budget expected surplus islamic ...,iceland eu application lift capital,blackfield capital founder value ruble thing r...,rocket stage earth rural chinese village,dead aircraft bomb greek tanker libyan port,belgian murderer van den bleeken request belgi...,czech president ukrainian pm yatsenyuk prime m...,vietnamese search bahamian cargo ship sinking,france end ukraine,china rare
2,2,2015-01-06,oil barrel,toyota fuel cell car future,young indian couple police protection death,senior figure islamic force syria eastern prov...,fukushima rice radiation st time disaster,spanish guilty financial audit court,abdullah saudi throne,taliban commander linkedin,...,india pakistan spread km mile stretch border d...,turkey erdogan corruption authority,spacex falcon launch recovery next launch wind...,cnn gambia coup,islamic state official,libya country entry,judicial inquiry france monday country notorio...,video moment cameraman factory small town colo...,syria united republican senator john mccain fo...,india set iris telescope


In [122]:
train.Date.shape

(1611,)

add date features 

In [157]:
def add_date_feature(df):
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day

In [158]:
# add date feature to dataframe 'date'
add_date_feature(train)
add_date_feature(test)
 


In [159]:
# use tf-idf to vectorize for combined top 25 topics
from sklearn.feature_extraction.text import TfidfVectorizer

# combine all top 25 topics into one column
df_train = train.copy()
df_test = test.copy()

for i in range(1, 26):
    topic = f'Top{i}'
    df_train[topic] = df_train[topic].fillna('').apply(clean_text)
    df_test[topic] = df_test[topic].fillna('').apply(clean_text)

df_train['combined_text'] = df_train.iloc[:, 2:27].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
df_test['combined_text'] = df_test.iloc[:, 2:27].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

df_train.head(3)



Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top20,Top21,Top22,Top23,Top24,Top25,Year,Month,Day,combined_text
0,2008-08-08,0,georgia russian war,musharraf,russia today south ossetia footage youtube,russian capital south ossetia georgian artille...,afghan impunity official sick year old nothing,russian south ossetia whilst georgia russian,georgia ossetia russia side,enemy combatent nothing sham salim haman,...,georgia iraq russian georgia breakaway region ...,pentagon iran bad idea world report,caucasus crisis georgia ossetia,indian shoe manufactory series work,mental,help kidnapping surge,2008,8,8,georgia russian war musharraf russia today sou...
1,2008-08-11,1,wont america nato iraq,bush georgian conflict,jewish georgian minister training russia,georgian army disarray gori shot,olympic ceremony,mossad fraudulent zealand iraq,russia israeli military sale,american citizen sossetia georgian genocide in...,...,war south ossetia,israeli group state torture,russia united head peak oil,question georgia russia conflict,much war,come trading sex food,2008,8,11,wont america nato iraq bush georgian conflict ...
2,2008-08-12,0,adorable yearold opening,russia georgia operation,sexual harassment,alqaeda support iraq brutal crackdown unislamic,ceasefire georgia putin west,microsoft intel laptop,russogeorgian war balance power,im sense whole georgiarussia war vote,...,adorable yearold country war evidence,war georgia israeli connection,point georgia south ossetia goddamnit bush,christopher georgian invasion south ossetia ru...,mexico,bbc asiapacific extinction man,2008,8,12,adorable yearold opening russia georgia operat...


In [160]:
df_test.head(3)

Unnamed: 0,ID,Date,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top20,Top21,Top22,Top23,Top24,Top25,Year,Month,Day,combined_text
0,0,2015-01-02,cancer result sheer bad luck unhealthy researc...,iran islamic state ploy region reality united ...,poll antimuslim,uk royal family prince lawsuit underage sex,bus destination rural northern sweden malm big...,pakistani boat india navy chase board vessel p...,sweden third mosque arson attack week,french year,...,secret underground complex development wmd nuc...,web freedom major global issue,austrian journalist erich mchel presentation a...,ukraine kiev,china harvesting,plug russia last independent tv station,2015,1,2,cancer result sheer bad luck unhealthy researc...
1,1,2015-01-05,high speed trip time current,ancient egypt sunday symbolic burial site god ...,china n korean world,scotland renewable energy energy country power...,prime minister shinzo abe monday remorse world...,sex centre prince scandal teen,gay relative hamas founder deportation canada ...,number female drug iran,...,dead aircraft bomb greek tanker libyan port,belgian murderer van den bleeken request belgi...,czech president ukrainian pm yatsenyuk prime m...,vietnamese search bahamian cargo ship sinking,france end ukraine,,2015,1,5,high speed trip time current ancient egypt sun...
2,2,2015-01-06,oil barrel,toyota fuel cell car future,young indian couple police protection death,senior figure islamic force syria eastern prov...,fukushima rice radiation st time disaster,spanish guilty financial audit court,abdullah saudi throne,taliban commander linkedin,...,islamic state official,libya country entry,judicial inquiry france monday country notorio...,video moment cameraman factory small town colo...,united republican senator john mccain former f...,india iris telescope,2015,1,6,oil barrel toyota fuel cell car future young i...


In [161]:
import matplotlib.pyplot as plt
import seaborn as sns

label_counts = df_train['Label'].value_counts()
print(label_counts)

Label
1    981
0    630
Name: count, dtype: int64


In [162]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.preprocessing import normalize
import scipy.sparse as sp
import numpy as np

class CTFIDFVectorizer(TfidfTransformer):
    """Convert a collection of raw documents to a matrix of c-TF-IDF features (class based tf-idf) - it is not a transformer model, it is a vectorizer model (it does not learn anything) - it is a modification of the TfidfTransformer class (it inherits from it) """
    def __init__(self, *args, **kwargs):
        super(CTFIDFVectorizer, self).__init__(*args, **kwargs)

    def fit(self, X: sp.csr_matrix, n_samples: int):
        """learn idf vector (global term weights) """
        _, n_features = X.shape
        df = np.squeeze(np.asarray(X.sum(axis=0)))
        idf = np.log(n_samples / df)
        self._idf_diag = sp.diags(idf, offsets=0,
                                  shape=(n_features, n_features),
                                  format='csr',
                                  dtype=np.float64)
        return self

    def transform(self, X: sp.csr_matrix) -> sp.csr_matrix:
        """transform a count-based matrix to c-TF-IDF / class based tf-idf """
        X = X * self._idf_diag
        X = normalize(X, axis=1, norm='l1', copy=False)
        return X

In [163]:
# create count vectorizer 
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

vectorizer = CountVectorizer(max_features=5000,stop_words=list(stop_words),ngram_range=(1,1), max_df=0.8, min_df=0.2)
count_features_train = vectorizer.fit_transform(df_train['combined_text'])
count_features_test = vectorizer.transform(df_test['combined_text'])

# convert to dataframe 
count_df_train = pd.DataFrame(count_features_train.toarray(), columns=[f'count_{i}' for i in range(count_features_train.shape[1])])
count_df_test = pd.DataFrame(count_features_test.toarray(), columns=[f'count_{i}' for i in range(count_features_test.shape[1])])


# ---------------------------------------

# create tf-idf vectorizer
tfidf = TfidfVectorizer(max_features=5000, stop_words=list(stop_words), ngram_range=(1, 1))
tfidf_features_train = tfidf.fit_transform(df_train['combined_text'])
tfidf_features_test = tfidf.transform(df_test['combined_text'])

tfidf_df_train = pd.DataFrame(tfidf_features_train.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_features_train.shape[1])])
tfidf_df_test = pd.DataFrame(tfidf_features_test.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_features_test.shape[1])])
tfidf_features_test = tfidf.transform(df_test['combined_text'])

# features 

df_train['Day_of_Week'] = df_train['Date'].dt.dayofweek
df_train['Is_Weekend'] = df_train['Day_of_Week'].apply(lambda x: 1 if x > 4 else 0)
df_train['Month_Start'] = df_train['Date'].dt.is_month_start.astype(int)
df_train['Month_End'] = df_train['Date'].dt.is_month_end.astype(int)
df_train['Quarter'] = df_train['Date'].dt.quarter

df_test['Day_of_Week'] = df_test['Date'].dt.dayofweek
df_test['Is_Weekend'] = df_test['Day_of_Week'].apply(lambda x: 1 if x > 4 else 0)
df_test['Month_Start'] = df_test['Date'].dt.is_month_start.astype(int)
df_test['Month_End'] = df_test['Date'].dt.is_month_end.astype(int)
df_test['Quarter'] = df_test['Date'].dt.quarter


# ---------------------------------------
# Initialize your CTFIDFVectorizer
ctfidf_vectorizer = CTFIDFVectorizer()

# Fit and transform using CTFIDFVectorizer
n_samples_train = count_features_train.shape[0]
n_samples_test = count_features_test.shape[0]

df_train.head(3)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top25,Year,Month,Day,combined_text,Day_of_Week,Is_Weekend,Month_Start,Month_End,Quarter
0,2008-08-08,0,georgia russian war,musharraf,russia today south ossetia footage youtube,russian capital south ossetia georgian artille...,afghan impunity official sick year old nothing,russian south ossetia whilst georgia russian,georgia ossetia russia side,enemy combatent nothing sham salim haman,...,help kidnapping surge,2008,8,8,georgia russian war musharraf russia today sou...,4,0,0,0,3
1,2008-08-11,1,wont america nato iraq,bush georgian conflict,jewish georgian minister training russia,georgian army disarray gori shot,olympic ceremony,mossad fraudulent zealand iraq,russia israeli military sale,american citizen sossetia georgian genocide in...,...,come trading sex food,2008,8,11,wont america nato iraq bush georgian conflict ...,0,0,0,0,3
2,2008-08-12,0,adorable yearold opening,russia georgia operation,sexual harassment,alqaeda support iraq brutal crackdown unislamic,ceasefire georgia putin west,microsoft intel laptop,russogeorgian war balance power,im sense whole georgiarussia war vote,...,bbc asiapacific extinction man,2008,8,12,adorable yearold opening russia georgia operat...,1,0,0,0,3


In [130]:
count_df_train.head()

Unnamed: 0,count_0,count_1,count_2,count_3,count_4,count_5,count_6,count_7,count_8,count_9,...,count_60,count_61,count_62,count_63,count_64,count_65,count_66,count_67,count_68,count_69
0,1,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,5,0,2,1
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,3,0,1,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,1,7,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,1,3,0,2,0


In [141]:
import pandas as pd
import numpy as np
# c-TF-IDF for training set
X_ctfidf_train = ctfidf_vectorizer.fit(count_features_train, n_samples_train).transform(count_features_train)
X_ctfidf_test = ctfidf_vectorizer.fit(count_features_test, n_samples_test).transform(count_features_test)
# convert to dataframe 
X_ctfidf_train = pd.DataFrame(X_ctfidf_train.toarray(), columns=[f'count_{i}' for i in range(count_features_train.shape[1])])
X_ctfidf_test = pd.DataFrame(X_ctfidf_test.toarray(), columns=[f'count_{i}' for i in range(count_features_test.shape[1])])


X_train = pd.concat([X_ctfidf_train, df_train[['Year', 'Month', 'Day','Quarter','Is_Weekend','Month_Start','Month_End','Day_of_Week']]], axis=1)
X_test = pd.concat([X_ctfidf_test, df_test[['Year', 'Month', 'Day','Quarter','Is_Weekend','Month_Start','Month_End','Day_of_Week']]], axis=1)

In [62]:
X_train = pd.concat([count_df_train, df_train[['Year', 'Month', 'Day','Quarter','Is_Weekend','Month_Start','Month_End','Day_of_Week']]], axis=1)
X_test = pd.concat([count_df_test, df_test[['Year', 'Month', 'Day','Quarter','Is_Weekend','Month_Start','Month_End','Day_of_Week']]], axis=1)

In [134]:
# only use for tfidf 
X_train = pd.concat([tfidf_df_train, df_train[['Year', 'Month', 'Day','Quarter','Is_Weekend','Month_Start','Month_End','Day_of_Week']]], axis=1)
X_test = pd.concat([tfidf_df_test, df_test[['Year', 'Month', 'Day','Quarter','Is_Weekend','Month_Start','Month_End','Day_of_Week']]], axis=1)

In [142]:
X_train.head()

Unnamed: 0,count_0,count_1,count_2,count_3,count_4,count_5,count_6,count_7,count_8,count_9,...,count_68,count_69,Year,Month,Day,Quarter,Is_Weekend,Month_Start,Month_End,Day_of_Week
0,0.047485,0.0,0.0,0.0,0.0,0.0,0.0,0.013492,0.0,0.0,...,0.008747,0.025349,2008,8,8,3,0,0,0,4
1,0.080559,0.078873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00742,0.0,2008,8,11,3,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.080285,0.0,0.0,0.0,0.0,...,0.007009,0.0,2008,8,12,3,0,0,0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.055211,2008,8,13,3,0,0,0,2
4,0.068084,0.0,0.054991,0.0,0.04845,0.0,0.0,0.0,0.0,0.0,...,0.012542,0.0,2008,8,14,3,0,0,0,3


In [143]:
X_train.shape, df_train['Label'].shape

X_train.head()

Unnamed: 0,count_0,count_1,count_2,count_3,count_4,count_5,count_6,count_7,count_8,count_9,...,count_68,count_69,Year,Month,Day,Quarter,Is_Weekend,Month_Start,Month_End,Day_of_Week
0,0.047485,0.0,0.0,0.0,0.0,0.0,0.0,0.013492,0.0,0.0,...,0.008747,0.025349,2008,8,8,3,0,0,0,4
1,0.080559,0.078873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00742,0.0,2008,8,11,3,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.080285,0.0,0.0,0.0,0.0,...,0.007009,0.0,2008,8,12,3,0,0,0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.055211,2008,8,13,3,0,0,0,2
4,0.068084,0.0,0.054991,0.0,0.04845,0.0,0.0,0.0,0.0,0.0,...,0.012542,0.0,2008,8,14,3,0,0,0,3


In [144]:
# split train
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils import shuffle
X = X_train

y = df_train['Label']
ros = RandomOverSampler(random_state=42, sampling_strategy='minority')
X_resampled, y_resampled = ros.fit_resample(X, y)
# count_cols = X_train.filter(regex='^count_').columns
count_cols = X_train.filter(regex='^tfidf_').columns
X_resampled[count_cols] = X_resampled[count_cols].astype(float) 


# Shuffle the dataset to ensure it's well mixed
X_resampled, y_resampled = shuffle(X_resampled, y_resampled, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


In [145]:
X_train.head()

Unnamed: 0,count_0,count_1,count_2,count_3,count_4,count_5,count_6,count_7,count_8,count_9,...,count_68,count_69,Year,Month,Day,Quarter,Is_Weekend,Month_Start,Month_End,Day_of_Week
294,0.056711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.03134,0.0,2009,10,8,4,0,0,0,3
1674,0.0,0.0,0.0,0.043252,0.031492,0.0,0.027915,0.012574,0.0,0.0,...,0.008152,0.0,2014,5,1,2,0,1,0,3
1379,0.0,0.0,0.051827,0.0,0.0,0.0,0.0,0.0,0.102762,0.0,...,0.00591,0.102762,2014,1,31,1,0,0,1,4
539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018026,0.0,0.0,...,0.005843,0.0,2010,9,29,3,0,0,0,2
407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067958,0.0,0.0,...,0.0,0.0,2010,3,23,1,0,0,0,1


In [146]:
# Random Forest
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint



"""
params = {
    "colsample_bytree": uniform(0.7, 0.3),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3), # default 0.1 
    "max_depth": randint(2, 6), # default 3
    "n_estimators": randint(100, 150), # default 100
    "subsample": uniform(0.6, 0.4)
}

"""
# Parameters for SVM
param_grid_svm = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10],
    'kernel': ['rbf', 'linear'],
    'class_weight': ['balanced']
}

# Parameters for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100],
    'max_features': ['log2', 'sqrt'],
    'max_depth': [4, 6, 8],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['entropy'],
    'class_weight': ['balanced'],
    'bootstrap': [True]
}


# Parameters for Logistic Regression
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'saga'],
    'class_weight': ['balanced']
}

# Pameters for XGBoost 

scale_pos_weight = len(df_train[df_train['Label'] == 0]) / len(df_train[df_train['Label'] == 1])

param_grid_xgb = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'colsample_bytree': [0.3, 0.7],
    'subsample': [0.6, 1.0],
    'gamma': [0, 0.5, 1],
    'booster': ['gbtree', 'gblinear'],
    'scale_pos_weight': [scale_pos_weight]
}


def train_and_evaluate(model, grid_params, X_train, y_train, X_val, y_val, X_test, model_name):
    # Grid Search
    grid_search = GridSearchCV(model, grid_params , refit=True, verbose=3, cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Validation Prediction and Evaluation
    y_val_pred = grid_search.predict(X_val)
    accuracy = accuracy_score(y_val, y_val_pred)
    print(f"{model_name} Accuracy: {accuracy}")
    print(classification_report(y_val, y_val_pred))
    print(confusion_matrix(y_val, y_val_pred))

    # Test Prediction
    preds_test = grid_search.predict(X_test)
    pd.DataFrame({'ID': np.arange(len(preds_test)), 'Label': preds_test}).to_csv(f'submission_{model_name}.csv', index=False)




In [147]:
# SVM
# train_and_evaluate(SVC(max_iter=1000), param_grid_svm, X_train, y_train, X_val, y_val, X_test, 'svm')

# Random Forest
train_and_evaluate(RandomForestClassifier(), param_grid_rf, X_train, y_train, X_val, y_val, X_test, 'rf')

# Logistic Regression
# train_and_evaluate(LogisticRegression(max_iter=1000), param_grid_lr, X_train, y_train, X_val, y_val, X_test, 'lr')

# XGBoost
train_and_evaluate(xgb.XGBClassifier( ), param_grid_xgb, X_train, y_train, X_val, y_val, X_test, 'xgb')


Fitting 3 folds for each of 108 candidates, totalling 324 fits
rf Accuracy: 0.6768447837150128
              precision    recall  f1-score   support

           0       0.63      0.75      0.69       186
           1       0.73      0.61      0.66       207

    accuracy                           0.68       393
   macro avg       0.68      0.68      0.68       393
weighted avg       0.69      0.68      0.68       393

[[140  46]
 [ 81 126]]
Fitting 3 folds for each of 192 candidates, totalling 576 fits
xgb Accuracy: 0.6692111959287532
              precision    recall  f1-score   support

           0       0.62      0.79      0.69       186
           1       0.75      0.56      0.64       207

    accuracy                           0.67       393
   macro avg       0.68      0.68      0.67       393
weighted avg       0.69      0.67      0.67       393

[[147  39]
 [ 91 116]]


In [164]:
df_train.head()

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top25,Year,Month,Day,combined_text,Day_of_Week,Is_Weekend,Month_Start,Month_End,Quarter
0,2008-08-08,0,georgia russian war,musharraf,russia today south ossetia footage youtube,russian capital south ossetia georgian artille...,afghan impunity official sick year old nothing,russian south ossetia whilst georgia russian,georgia ossetia russia side,enemy combatent nothing sham salim haman,...,help kidnapping surge,2008,8,8,georgia russian war musharraf russia today sou...,4,0,0,0,3
1,2008-08-11,1,wont america nato iraq,bush georgian conflict,jewish georgian minister training russia,georgian army disarray gori shot,olympic ceremony,mossad fraudulent zealand iraq,russia israeli military sale,american citizen sossetia georgian genocide in...,...,come trading sex food,2008,8,11,wont america nato iraq bush georgian conflict ...,0,0,0,0,3
2,2008-08-12,0,adorable yearold opening,russia georgia operation,sexual harassment,alqaeda support iraq brutal crackdown unislamic,ceasefire georgia putin west,microsoft intel laptop,russogeorgian war balance power,im sense whole georgiarussia war vote,...,bbc asiapacific extinction man,2008,8,12,adorable yearold opening russia georgia operat...,1,0,0,0,3
3,2008-08-13,0,israel iran report,president capital come,israel cameraman,policy tough pointless former civil servant unit,body year old trunk ransom victim mexico head ...,prefab,bush operation,russian georgian,...,nobel laureate aleksander solzhenitsyn russia,2008,8,13,israel iran report president capital come isra...,2,0,0,0,3
4,2008-08-14,1,,war south osetia russian soldier,swedish wrestler ara abrahamian medal olympic ...,russia death toll south ossetia,missile pakistan cia,rushdie condemns random novel fear muslim reta...,poland defense deal interesting timing,tblisi bet,...,peace assurance,2008,8,14,war south osetia russian soldier swedish wres...,3,0,0,0,3


In [168]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

# Preprocessing helpers
text_imputer = SimpleImputer(strategy='constant', fill_value='') 
num_imputer = SimpleImputer(strategy='mean') 
scaler = StandardScaler()

# Initialize variables
models = {}
text_columns = ['Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7', 'Top8', 'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15', 'Top16', 'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23', 'Top24', 'Top25']  # Textual features
probabilities = []
individual_models = {}
num_columns = ['Year', 'Month', 'Day', 'Quarter', 'Is_Weekend', 'Month_Start', 'Month_End', 'Day_of_Week']  

# Split the data
X_train, X_val, y_train, y_val = train_test_split(df_train.drop(['Date','Label'], axis=1), df_train['Label'], test_size=0.2, random_state=42, shuffle=True)


# Train models for each feature
for col in df_train.columns:

    if col in text_columns:
        # Impute missing values for text columns
        X_train_feature = text_imputer.fit_transform(X_train[[col]])
        X_val_feature = text_imputer.transform(X_val[[col]])
        vectorizer = TfidfVectorizer(max_features=5000, stop_words=list(stop_words), ngram_range=(1, 1))
        X_train_transformed = vectorizer.fit_transform(X_train_feature.ravel())
        X_val_transformed = vectorizer.transform(X_val_feature.ravel())
    elif col in num_columns:
        # Convert to numeric, handle missing values, and scale for numerical columns
        X_train_feature = pd.to_numeric(X_train[col], errors='coerce').values.reshape(-1, 1)
        X_val_feature = pd.to_numeric(X_val[col], errors='coerce').values.reshape(-1, 1)
        X_train_transformed = num_imputer.fit_transform(X_train_feature)
        X_val_transformed = num_imputer.transform(X_val_feature)
        X_train_transformed = scaler.fit_transform(X_train_transformed)
        X_val_transformed = scaler.transform(X_val_transformed)
    else:
        continue  # Skip non-numeric, non-text columns

    # Train the model
    #model = LogisticRegression(max_iter=1000, C=1, penalty='l2', solver='lbfgs', random_state=42, n_jobs=-1)
    model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, max_depth=8, max_features='log2', criterion='entropy',class_weight='balanced')
    model.fit(X_train_transformed, y_train)
    models[col] = (model, vectorizer if col in text_columns else None)

    # Store the probabilities for positive class
    probabilities.append(model.predict_proba(X_val_transformed)[:, 1])


# Average the probabilities
average_prob = np.mean(probabilities, axis=0)

# Convert average probabilities to binary predictions
final_predictions = (average_prob > 0.5).astype(int)
print(final_predictions)
# Evaluate the final predictions
final_accuracy = accuracy_score(y_val, final_predictions)
print(f"Final Accuracy on Feature-wise Validation Set: {final_accuracy}") # 0.6346749226006192


[0 0 0 1 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 1 1 0 0 0 1 1 0 0 1 0 0 1 1 1 1 1 0
 1 0 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 1 1 1 1 1 0 0 1 0 1 1 1 0 0 1 0 0 1 1 0
 1 0 0 0 1 0 1 1 0 1 0 1 1 1 1 0 1 0 0 1 0 1 0 1 0 0 1 1 0 1 1 1 1 0 0 0 1
 0 1 1 0 1 1 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0 1 0 1 1 0 1 0 0 0 1 1 0 0 1 0 1
 1 0 0 0 1 0 1 0 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 0 0 1 1 0 0 0 1 1 0 0 1 0
 1 1 0 0 1 1 1 1 1 0 0 1 0 1 0 1 1 1 1 0 0 1 0 1 0 0 1 0 0 0 1 1 0 0 1 1 1
 1 0 0 1 1 1 0 1 0 0 1 1 0 0 0 1 0 0 1 0 1 0 1 0 1 1 1 1 0 1 0 0 0 1 0 1 1
 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 0 1 1 1 1 0 1 1 1 1 0 0 0 1 0 0
 1 0 1 1 0 1 0 0 0 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1]
Final Accuracy on Feature-wise Validation Set: 0.5541795665634675


In [178]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

stacked_predictions_train = pd.DataFrame()
stacked_predictions_test = pd.DataFrame()
top_columns = ['Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7', 'Top8', 'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15', 'Top16', 'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23', 'Top24', 'Top25']  # Textual features
scale_pos_weight = len(df_train[df_train['Label'] == 0]) / len(df_train[df_train['Label'] == 1])

# Define your base learners
base_learners = [
    ('xgb', xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, colsample_bytree=0.7, subsample=1.0, gamma=0.5, booster='gbtree', scale_pos_weight=scale_pos_weight, n_jobs=-1, random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', max_depth=8, max_features='log2', criterion='entropy', n_jobs=-1))
]

param_grid_meta = {
    'C': [0.1, 1, 10],
    'max_iter': [100, 500, 1000],
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'class_weight': ['balanced']

}
meta_learner = GridSearchCV(LogisticRegression(class_weight='balanced', random_state=42, n_jobs=-1), 
                                param_grid_meta, 
                                cv=5, 
                                scoring='accuracy', 
                                n_jobs=-1)


for column in top_columns:
    df_train[column] = df_train[column].fillna('').apply(clean_text)
    X = df_train[column]
    y = df_train['Label']

    # Text Vectorization
    vectorizer = TfidfVectorizer( max_features=5000, stop_words=list(stop_words), ngram_range=(1, 1))
    X_vectorized = vectorizer.fit_transform(X)

    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42, shuffle=True)

    # Stacking Classifier
    stack_clf = StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)
    stack_clf.fit(X_train, y_train)

    # Collect predictions for train and test set
    stacked_predictions_train[column] = stack_clf.predict(X_train)
    stacked_predictions_test[column] = stack_clf.predict(X_test)

# Train meta-learner
meta_learner.fit(stacked_predictions_train, y_train)

# Predict on the test set
final_predictions = meta_learner.predict(stacked_predictions_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, final_predictions)
print(f"Overall Accuracy: {accuracy}")


"""
# Define hyperparameters for GridSearchCV for each base learner
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    # Add more parameters as needed
}

param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    # Add more parameters as needed
}

# Function to train and return a model with GridSearchCV
def train_model(X, y, model, param_grid):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X, y)
    return grid_search.best_estimator_

# Train and collect predictions
for column in top_columns:
    X = df_train[column]
    y = df_train['Label']
    vectorizer = TfidfVectorizer(max_features=5000, stop_words=stop_words, ngram_range=(1, 1))
    X_vectorized = vectorizer.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42, shuffle=True)

    best_rf = train_model(X_train, y_train, RandomForestClassifier(), param_grid_rf)
    best_xgb = train_model(X_train, y_train, xgb.XGBClassifier(), param_grid_xgb)

    stacked_predictions_train[column] = best_rf.predict(X_train) + best_xgb.predict(X_train)
    stacked_predictions_test[column] = best_rf.predict(X_test) + best_xgb.predict(X_test)

"""


Overall Accuracy: 0.5386996904024768


In [184]:
# Train meta-learner
meta_learner.fit(stacked_predictions_train, y_train)



In [185]:
# Predict on the test set
final_predictions = meta_learner.predict(stacked_predictions_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, final_predictions)
print(f"Overall Accuracy: {accuracy}")


Overall Accuracy: 0.5386996904024768
