## Model competition

In [54]:
import re
import re 
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
nltk.download('punkt')  # For tokenization
nltk.download('averaged_perceptron_tagger') 
nltk.download('wordnet')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
custom_stopwords = ['says', 'said','one','new' ,'news']
stop_words.update(custom_stopwords)

def clean_text(text):
    """ Clean text data by removing special characters and stopwords"""
    if text.startswith("b'") or text.startswith('b"'):
        text = text[2:-1]
    
    # Remove special characters
    text = bytes(text, 'utf-8').decode('unicode_escape', 'ignore')
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    # Keep only nouns and adjectives
    words = [word for word, tag in pos_tags if tag in ['NN', 'JJ']]
    #words = re.findall(r'\w+', text) 

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Remove stopwords
    words = [word for word in words if word not in stop_words]

    return ' '.join(words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ahnki\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ahnki\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ahnki\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahnki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [55]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# fill missing values and clean text
for i in range(1, 26):
    topic = f'Top{i}'
    train[topic] = train[topic].fillna('').apply(clean_text)
    test[topic] = test[topic].fillna('').apply(clean_text)

In [56]:
train.head(3)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,georgia russian war,musharraf,russia today column south ossetia footage youtube,russian capital south ossetia georgian artille...,afghan impunity official sick year old nothing,russian south ossetia whilst georgia russian,georgia ossetia russia side,enemy combatent nothing sham salim haman,...,georgia ossetia russia absorb georgia full sca...,alqaeda islamist backlash,condoleezza rice israeli strike iran israeli d...,busy day european union iran protest nuclear p...,georgia iraq russian georgia breakaway region ...,pentagon iran bad idea world report,caucasus crisis georgia ossetia,indian shoe manufactory series work,mental,help mexico kidnapping surge
1,2008-08-11,1,wont america nato iraq,bush georgian conflict,jewish georgian minister training russia,georgian army disarray gori shot,olympic ceremony,mossad fraudulent zealand iraq,russia israeli military sale,american citizen living sossetia georgian geno...,...,israel georgian aggression,tv russian georgian,montreal canada police boy saturday,china manufacturer,war south ossetia,israeli group state torture,russia united head peak oil,question georgia russia conflict,russia much war,come trading sex food
2,2008-08-12,0,adorable yearold opening,russia georgia operation,sexual harassment,alqaeda support iraq brutal crackdown unislamic,ceasefire georgia putin west,microsoft intel laptop,russogeorgian war balance power,im sense whole georgiarussia war vote,...,georgia georgia first place,russia response georgia right,gorbachev serious blunder interest caucasus re...,russia georgia nato cold war,adorable yearold country war evidence,war georgia israeli connection,point encouraging georgia south ossetia goddam...,christopher georgian invasion south ossetia ru...,mexico,bbc asiapacific extinction man


In [57]:
test.head(3) 

Unnamed: 0,ID,Date,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,0,2015-01-02,cancer result sheer bad luck unhealthy researc...,iran united islamic state ploy region reality ...,poll antimuslim,uk royal family prince andrew lawsuit underage...,bus destination rural northern sweden malm big...,pakistani boat india navy chase board vessel p...,sweden third mosque arson attack week,french year,...,ukrainian minister tv closure russian,palestinian president mahmoud abbas serious co...,israeli security center killed hamas,year year syria fouryear conflict,secret underground complex development wmd nuc...,web freedom major global issue,austrian journalist erich mchel presentation h...,ukraine kiev,china harvesting executed,plug russia last independent tv station
1,1,2015-01-05,high speed train trip time current,ancient egypt sunday symbolic burial site god ...,china n korean soldier world,scotland fossil fuelfree renewable energy ener...,prime minister shinzo abe monday remorse world...,sex centre prince andrew scandal teen,gay relative hamas founder deportation canada ...,number female drug iran,...,islamic state budget expected surplus islamic ...,iceland eu application lift capital,blackfield capital founder value ruble thing r...,rocket stage earth rural chinese village,dead aircraft bomb greek tanker libyan port,belgian murderer van den bleeken request belgi...,czech president ukrainian pm yatsenyuk prime m...,vietnamese search bahamian cargo ship sinking,france end ukraine,china rare
2,2,2015-01-06,oil barrel,toyota fuel cell car future,young indian couple police protection death,senior figure islamic force syria eastern prov...,fukushima rice radiation st time disaster,spanish guilty financial audit court,abdullah saudi throne,taliban commander linkedin,...,india pakistan spread km mile stretch border d...,turkey erdogan corruption authority,spacex falcon launch recovery next launch wind...,cnn gambia coup,islamic state official,libya country entry,judicial inquiry france monday country notorio...,video moment cameraman factory small town colo...,syria united republican senator john mccain fo...,india set iris telescope


In [58]:
train.Date.shape

(1611,)

add date features 

In [59]:
def add_date_feature(df):
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day

In [60]:
# add date feature to dataframe 'date'
add_date_feature(train)
add_date_feature(test)
 


In [61]:
# use tf-idf to vectorize for combined top 25 topics
from sklearn.feature_extraction.text import TfidfVectorizer

# combine all top 25 topics into one column
df_train = train.copy()
df_test = test.copy()

for i in range(1, 26):
    topic = f'Top{i}'
    df_train[topic] = df_train[topic].fillna('').apply(clean_text)
    df_test[topic] = df_test[topic].fillna('').apply(clean_text)

df_train['combined_text'] = df_train.iloc[:, 2:27].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
df_test['combined_text'] = df_test.iloc[:, 2:27].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

df_train.head(3)



Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top20,Top21,Top22,Top23,Top24,Top25,Year,Month,Day,combined_text
0,2008-08-08,0,georgia russian war,musharraf,russia today south ossetia footage youtube,russian capital south ossetia georgian artille...,afghan impunity official sick year old nothing,russian south ossetia whilst georgia russian,georgia ossetia russia side,enemy combatent nothing sham salim haman,...,georgia iraq russian georgia breakaway region ...,pentagon iran bad idea world report,caucasus crisis georgia ossetia,indian shoe manufactory series work,mental,help kidnapping surge,2008,8,8,georgia russian war musharraf russia today sou...
1,2008-08-11,1,wont america nato iraq,bush georgian conflict,jewish georgian minister training russia,georgian army disarray gori shot,olympic ceremony,mossad fraudulent zealand iraq,russia israeli military sale,american citizen sossetia georgian genocide in...,...,war south ossetia,israeli group state torture,russia united head peak oil,question georgia russia conflict,much war,come trading sex food,2008,8,11,wont america nato iraq bush georgian conflict ...
2,2008-08-12,0,adorable yearold opening,russia georgia operation,sexual harassment,alqaeda support iraq brutal crackdown unislamic,ceasefire georgia putin west,microsoft intel laptop,russogeorgian war balance power,im sense whole georgiarussia war vote,...,adorable yearold country war evidence,war georgia israeli connection,point georgia south ossetia goddamnit bush,christopher georgian invasion south ossetia ru...,mexico,bbc asiapacific extinction man,2008,8,12,adorable yearold opening russia georgia operat...


In [62]:
df_test.head(3)

Unnamed: 0,ID,Date,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top20,Top21,Top22,Top23,Top24,Top25,Year,Month,Day,combined_text
0,0,2015-01-02,cancer result sheer bad luck unhealthy researc...,iran islamic state ploy region reality united ...,poll antimuslim,uk royal family prince lawsuit underage sex,bus destination rural northern sweden malm big...,pakistani boat india navy chase board vessel p...,sweden third mosque arson attack week,french year,...,secret underground complex development wmd nuc...,web freedom major global issue,austrian journalist erich mchel presentation a...,ukraine kiev,china harvesting,plug russia last independent tv station,2015,1,2,cancer result sheer bad luck unhealthy researc...
1,1,2015-01-05,high speed trip time current,ancient egypt sunday symbolic burial site god ...,china n korean world,scotland renewable energy energy country power...,prime minister shinzo abe monday remorse world...,sex centre prince scandal teen,gay relative hamas founder deportation canada ...,number female drug iran,...,dead aircraft bomb greek tanker libyan port,belgian murderer van den bleeken request belgi...,czech president ukrainian pm yatsenyuk prime m...,vietnamese search bahamian cargo ship sinking,france end ukraine,,2015,1,5,high speed trip time current ancient egypt sun...
2,2,2015-01-06,oil barrel,toyota fuel cell car future,young indian couple police protection death,senior figure islamic force syria eastern prov...,fukushima rice radiation st time disaster,spanish guilty financial audit court,abdullah saudi throne,taliban commander linkedin,...,islamic state official,libya country entry,judicial inquiry france monday country notorio...,video moment cameraman factory small town colo...,united republican senator john mccain former f...,india iris telescope,2015,1,6,oil barrel toyota fuel cell car future young i...


In [40]:
import matplotlib.pyplot as plt
import seaborn as sns

label_counts = df_train['Label'].value_counts()
print(label_counts)

Label
1    981
0    630
Name: count, dtype: int64


In [63]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.preprocessing import normalize
import scipy.sparse as sp

class CTFIDFVectorizer(TfidfTransformer):
    """Convert a collection of raw documents to a matrix of c-TF-IDF features (class based tf-idf) - it is not a transformer model, it is a vectorizer model (it does not learn anything) - it is a modification of the TfidfTransformer class (it inherits from it) """
    def __init__(self, *args, **kwargs):
        super(CTFIDFVectorizer, self).__init__(*args, **kwargs)

    def fit(self, X: sp.csr_matrix, n_samples: int):
        """learn idf vector (global term weights) """
        _, n_features = X.shape
        df = np.squeeze(np.asarray(X.sum(axis=0)))
        idf = np.log(n_samples / df)
        self._idf_diag = sp.diags(idf, offsets=0,
                                  shape=(n_features, n_features),
                                  format='csr',
                                  dtype=np.float64)
        return self

    def transform(self, X: sp.csr_matrix) -> sp.csr_matrix:
        """transform a count-based matrix to c-TF-IDF / class based tf-idf """
        X = X * self._idf_diag
        X = normalize(X, axis=1, norm='l1', copy=False)
        return X

In [86]:
# create count vectorizer 
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

vectorizer = CountVectorizer(max_features=5000,stop_words=list(stop_words),ngram_range=(1,1), max_df=0.8, min_df=0.2)
count_features_train = vectorizer.fit_transform(df_train['combined_text'])
count_features_test = vectorizer.transform(df_test['combined_text'])

# convert to dataframe 
count_df_train = pd.DataFrame(count_features_train.toarray(), columns=[f'count_{i}' for i in range(count_features_train.shape[1])])
count_df_test = pd.DataFrame(count_features_test.toarray(), columns=[f'count_{i}' for i in range(count_features_test.shape[1])])


# ---------------------------------------

# create tf-idf vectorizer
tfidf = TfidfVectorizer(max_features=5000, stop_words=list(stop_words), ngram_range=(1, 1))
tfidf_features_train = tfidf.fit_transform(df_train['combined_text'])
tfidf_features_test = tfidf.transform(df_test['combined_text'])

tfidf_df_train = pd.DataFrame(tfidf_features_train.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_features_train.shape[1])])
tfidf_df_test = pd.DataFrame(tfidf_features_test.toarray(), columns=[f'tfidf_{i}' for i in range(tfidf_features_test.shape[1])])
tfidf_features_test = tfidf.transform(df_test['combined_text'])

# features 

df_train['Day_of_Week'] = df_train['Date'].dt.dayofweek
df_train['Is_Weekend'] = df_train['Day_of_Week'].apply(lambda x: 1 if x > 4 else 0)
df_train['Month_Start'] = df_train['Date'].dt.is_month_start.astype(int)
df_train['Month_End'] = df_train['Date'].dt.is_month_end.astype(int)
df_train['Quarter'] = df_train['Date'].dt.quarter

df_test['Day_of_Week'] = df_test['Date'].dt.dayofweek
df_test['Is_Weekend'] = df_test['Day_of_Week'].apply(lambda x: 1 if x > 4 else 0)
df_test['Month_Start'] = df_test['Date'].dt.is_month_start.astype(int)
df_test['Month_End'] = df_test['Date'].dt.is_month_end.astype(int)
df_test['Quarter'] = df_test['Date'].dt.quarter


# ---------------------------------------
# Initialize your CTFIDFVectorizer
ctfidf_vectorizer = CTFIDFVectorizer()

# Fit and transform using CTFIDFVectorizer
n_samples_train = count_features_train.shape[0]
n_samples_test = count_features_test.shape[0]

df_train.head(3)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top25,Year,Month,Day,combined_text,Day_of_Week,Is_Weekend,Month_Start,Month_End,Quarter
0,2008-08-08,0,georgia russian war,musharraf,russia today south ossetia footage youtube,russian capital south ossetia georgian artille...,afghan impunity official sick year old nothing,russian south ossetia whilst georgia russian,georgia ossetia russia side,enemy combatent nothing sham salim haman,...,help kidnapping surge,2008,8,8,georgia russian war musharraf russia today sou...,4,0,0,0,3
1,2008-08-11,1,wont america nato iraq,bush georgian conflict,jewish georgian minister training russia,georgian army disarray gori shot,olympic ceremony,mossad fraudulent zealand iraq,russia israeli military sale,american citizen sossetia georgian genocide in...,...,come trading sex food,2008,8,11,wont america nato iraq bush georgian conflict ...,0,0,0,0,3
2,2008-08-12,0,adorable yearold opening,russia georgia operation,sexual harassment,alqaeda support iraq brutal crackdown unislamic,ceasefire georgia putin west,microsoft intel laptop,russogeorgian war balance power,im sense whole georgiarussia war vote,...,bbc asiapacific extinction man,2008,8,12,adorable yearold opening russia georgia operat...,1,0,0,0,3


In [65]:
count_df_train.head()

Unnamed: 0,count_0,count_1,count_2,count_3,count_4,count_5,count_6,count_7,count_8,count_9,...,count_60,count_61,count_62,count_63,count_64,count_65,count_66,count_67,count_68,count_69
0,1,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,5,0,2,1
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,3,0,1,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,1,7,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,1,3,0,2,0


In [15]:
import pandas as pd
import numpy as np
# c-TF-IDF for training set
X_ctfidf_train = ctfidf_vectorizer.fit(count_features_train, n_samples_train).transform(count_features_train)
X_ctfidf_test = ctfidf_vectorizer.fit(count_features_test, n_samples_test).transform(count_features_test)
# convert to dataframe 
X_ctfidf_train = pd.DataFrame(X_ctfidf_train.toarray(), columns=[f'count_{i}' for i in range(count_features_train.shape[1])])
X_ctfidf_test = pd.DataFrame(X_ctfidf_test.toarray(), columns=[f'count_{i}' for i in range(count_features_test.shape[1])])


X_train = pd.concat([X_ctfidf_train, df_train[['Year', 'Month', 'Day','Quarter','Is_Weekend','Month_Start','Month_End','Day_of_Week']]], axis=1)
X_test = pd.concat([X_ctfidf_test, df_test[['Year', 'Month', 'Day','Quarter','Is_Weekend','Month_Start','Month_End','Day_of_Week']]], axis=1)

In [87]:
X_train = pd.concat([count_df_train, df_train[['Year', 'Month', 'Day','Quarter','Is_Weekend','Month_Start','Month_End','Day_of_Week']]], axis=1)
X_test = pd.concat([count_df_test, df_test[['Year', 'Month', 'Day','Quarter','Is_Weekend','Month_Start','Month_End','Day_of_Week']]], axis=1)

In [81]:
# only use for tfidf 
X_train = pd.concat([tfidf_df_train, df_train[['Year', 'Month', 'Day','Quarter','Is_Weekend','Month_Start','Month_End','Day_of_Week']]], axis=1)
X_test = pd.concat([tfidf_df_test, df_test[['Year', 'Month', 'Day','Quarter','Is_Weekend','Month_Start','Month_End','Day_of_Week']]], axis=1)

In [88]:
X_train.head()

Unnamed: 0,count_0,count_1,count_2,count_3,count_4,count_5,count_6,count_7,count_8,count_9,...,count_68,count_69,Year,Month,Day,Quarter,Is_Weekend,Month_Start,Month_End,Day_of_Week
0,1,0,0,0,0,0,0,1,0,0,...,2,1,2008,8,8,3,0,0,0,4
1,1,1,0,0,0,0,0,0,0,0,...,1,0,2008,8,11,3,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,1,0,2008,8,12,3,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,1,2008,8,13,3,0,0,0,2
4,1,0,1,0,1,0,0,0,0,0,...,2,0,2008,8,14,3,0,0,0,3


In [90]:
X_train.shape, df_train['Label'].shape

X_train.head()

Unnamed: 0,count_0,count_1,count_2,count_3,count_4,count_5,count_6,count_7,count_8,count_9,...,count_68,count_69,Year,Month,Day,Quarter,Is_Weekend,Month_Start,Month_End,Day_of_Week
0,1,0,0,0,0,0,0,1,0,0,...,2,1,2008,8,8,3,0,0,0,4
1,1,1,0,0,0,0,0,0,0,0,...,1,0,2008,8,11,3,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,1,0,2008,8,12,3,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,1,2008,8,13,3,0,0,0,2
4,1,0,1,0,1,0,0,0,0,0,...,2,0,2008,8,14,3,0,0,0,3


In [91]:
# split train
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils import shuffle
X = X_train
y = df_train['Label']
ros = RandomOverSampler(random_state=42, sampling_strategy='minority')
X_resampled, y_resampled = ros.fit_resample(X, y)

# Shuffle the dataset to ensure it's well mixed
X_resampled, y_resampled = shuffle(X_resampled, y_resampled, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


In [92]:
X_train.head()

Unnamed: 0,count_0,count_1,count_2,count_3,count_4,count_5,count_6,count_7,count_8,count_9,...,count_68,count_69,Year,Month,Day,Quarter,Is_Weekend,Month_Start,Month_End,Day_of_Week
294,1,0,0,0,0,0,0,0,0,0,...,6,0,2009,10,8,4,0,0,0,3
1674,0,0,0,1,1,0,1,1,0,0,...,2,0,2014,5,1,2,0,1,0,3
1379,0,0,1,0,0,0,0,0,3,0,...,1,3,2014,1,31,1,0,0,1,4
539,0,0,0,0,0,0,0,1,0,0,...,1,0,2010,9,29,3,0,0,0,2
407,0,0,0,0,0,0,0,2,0,0,...,0,0,2010,3,23,1,0,0,0,1


In [95]:
# Random Forest
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint


xgb_model = xgb.XGBRegressor()

params = {
    "colsample_bytree": uniform(0.7, 0.3),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3), # default 0.1 
    "max_depth": randint(2, 6), # default 3
    "n_estimators": randint(100, 150), # default 100
    "subsample": uniform(0.6, 0.4)
}

search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=42, n_iter=200, cv=3, verbose=1, n_jobs=1, return_train_score=True)

# Pameters for XGBoost 

param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'colsample_bytree': [0.3, 0.7],
    'subsample': [0.6, 0.8],
    'gamma': [0, 0.5],
    'boosting_type' : ['gbtree', 'dart'],
    'booster' : ['gbtree', 'gblinear', 'dart']
}

# Parameters for SVM
param_grid_svm = {
    'C': [ 0.1, 1, 5],
    'gamma': [1, 0.1],
    'kernel': ['rbf', 'linear']
}

# Parameters for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_features': ['log2', 'sqrt'],
    'max_depth' : [4, 6],
    'criterion' :['entropy'] # entropy
}


# Parameters for Logistic Regression
param_grid_lr = {
    'C': [ 1, 10],
    'penalty': ['l2'],
    'solver': [ 'lbfgs']
}

# For SVM
grid_svm = GridSearchCV(SVC(max_iter=1000), param_grid_svm, refit=True, verbose=3, cv=3, n_jobs=-1)

# For Random Forest
grid_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, refit=True, verbose=3, cv=3, n_jobs=-1)

# For Logistic Regression
grid_lr = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_lr, refit=True, verbose=3, cv=3, n_jobs=-1)




In [100]:
grid_svm.fit(X_train, y_train)
grid_rf.fit(X_train, y_train)
grid_lr.fit(X_train, y_train)
grid_search_xgb.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits




Fitting 3 folds for each of 8 candidates, totalling 24 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 384 candidates, totalling 1152 fits


KeyboardInterrupt: 

In [99]:
import numpy as np

# SVM Evaluation
y_val_pred_svm = grid_svm.predict(X_val)
accuracy_svm = accuracy_score(y_val, y_val_pred_svm)
print(f"SVM Accuracy: {accuracy_svm}")
print(classification_report(y_val, y_val_pred_svm))
print(confusion_matrix(y_val, y_val_pred_svm))
preds_test = grid_svm.predict(X_test)

print((grid_svm.predict(X_val) == y_val).mean() * 100.0)

pd.DataFrame({'ID': np.arange(len(preds_test)), 'Label': preds_test}).to_csv('submission_svm_count.csv', index=False)

# Random Forest Evaluation
y_val_pred_rf = grid_rf.predict(X_val)
accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf}")
print(classification_report(y_val, y_val_pred_rf))
print(confusion_matrix(y_val, y_val_pred_rf))
preds_test = grid_rf.predict(X_test)

print((grid_rf.predict(X_val) == y_val).mean() * 100.0)
pd.DataFrame({'ID': np.arange(len(preds_test)), 'Label': preds_test}).to_csv('submission_rf_tfidf.csv', index=False)


# Logistic Regression Evaluation
y_val_pred_lr = grid_lr.predict(X_val)
accuracy_lr = accuracy_score(y_val, y_val_pred_lr)
print(f"Logistic Regression Accuracy: {accuracy_lr}")
print(classification_report(y_val, y_val_pred_lr))
print(confusion_matrix(y_val, y_val_pred_lr))
preds_test = grid_lr.predict(X_test)

print((grid_lr.predict(X_val) == y_val).mean() * 100.0)
pd.DataFrame({'ID': np.arange(len(preds_test)), 'Label': preds_test}).to_csv('submission_lr_tfidf.csv', index=False)


# XGBoost Evaluation
y_val_pred_xgb = grid_xgb.predict(X_val)
accuracy_xgb = accuracy_score(y_val, y_val_pred_xgb)
print(f"XGB Accuracy: {accuracy_xgb}")
print(classification_report(y_val, y_val_pred_xgb))
print(confusion_matrix(y_val, y_val_pred_xgb))
preds_test = grid_xgb.predict(X_test)

print((grid_xgb.predict(X_val) == y_val).mean() * 100.0)
pd.DataFrame({'ID': np.arange(len(preds_test)), 'Label': preds_test}).to_csv('submission_xgb_tfidf.csv', index=False)



SVM Accuracy: 0.7964376590330788
              precision    recall  f1-score   support

           0       1.00      0.57      0.73       186
           1       0.72      1.00      0.84       207

    accuracy                           0.80       393
   macro avg       0.86      0.78      0.78       393
weighted avg       0.85      0.80      0.79       393

[[106  80]
 [  0 207]]
79.64376590330788
Random Forest Accuracy: 0.6361323155216285
              precision    recall  f1-score   support

           0       0.59      0.78      0.67       186
           1       0.72      0.51      0.59       207

    accuracy                           0.64       393
   macro avg       0.65      0.64      0.63       393
weighted avg       0.66      0.64      0.63       393

[[145  41]
 [102 105]]
63.61323155216285
Logistic Regression Accuracy: 0.5776081424936387
              precision    recall  f1-score   support

           0       0.55      0.61      0.58       186
           1       0.61      0

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [71]:
# Logistical Regression 
"""
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_val_pred = model.predict(X_val) 
preds_test = model.predict(X_test)

print(classification_report(y_val, y_val_pred))
print(confusion_matrix(y_val, y_val_pred))
preds_test = model.predict(X_test)
"""

              precision    recall  f1-score   support

           0       0.43      0.08      0.13       118
           1       0.64      0.94      0.76       205

    accuracy                           0.63       323
   macro avg       0.53      0.51      0.45       323
weighted avg       0.56      0.63      0.53       323

[[  9 109]
 [ 12 193]]


In [75]:
import numpy as np
# creating a submission file for kaggle
pd.DataFrame({'ID': np.arange(len(preds_test)), 'Label': preds_test}).to_csv('submission_2gram.csv', index=False)

In [99]:
df_train.head(3)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top25,Year,Month,Day,combined_text,Day_of_Week,Is_Weekend,Month_Start,Month_End,Quarter
0,2008-08-08,0,georgia russian war,musharraf,russia today south ossetia footage youtube,russian capital south ossetia georgian artille...,afghan impunity official sick year old nothing,russian south ossetia whilst georgia russian,georgia ossetia russia side,enemy combatent nothing sham salim haman,...,help kidnapping surge,2008,8,8,georgia russian war musharraf russia today sou...,4,0,0,0,3
1,2008-08-11,1,wont america nato iraq,bush georgian conflict,jewish georgian minister training russia,georgian army disarray gori shot,olympic ceremony,mossad fraudulent zealand iraq,russia israeli military sale,american citizen sossetia georgian genocide in...,...,come trading sex food,2008,8,11,wont america nato iraq bush georgian conflict ...,0,0,0,0,3
2,2008-08-12,0,adorable yearold opening,russia georgia operation,sexual harassment,alqaeda support iraq brutal crackdown unislamic,ceasefire georgia putin west,microsoft intel laptop,russogeorgian war balance power,im sense whole georgiarussia war vote,...,bbc asiapacific extinction man,2008,8,12,adorable yearold opening russia georgia operat...,1,0,0,0,3


In [107]:
df_train.head(3)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top25,Year,Month,Day,combined_text,Day_of_Week,Is_Weekend,Month_Start,Month_End,Quarter
0,2008-08-08,0,georgia russian war,musharraf,russia today south ossetia footage youtube,russian capital south ossetia georgian artille...,afghan impunity official sick year old nothing,russian south ossetia whilst georgia russian,georgia ossetia russia side,enemy combatent nothing sham salim haman,...,help kidnapping surge,2008,8,8,georgia russian war musharraf russia today sou...,4,0,0,0,3
1,2008-08-11,1,wont america nato iraq,bush georgian conflict,jewish georgian minister training russia,georgian army disarray gori shot,olympic ceremony,mossad fraudulent zealand iraq,russia israeli military sale,american citizen sossetia georgian genocide in...,...,come trading sex food,2008,8,11,wont america nato iraq bush georgian conflict ...,0,0,0,0,3
2,2008-08-12,0,adorable yearold opening,russia georgia operation,sexual harassment,alqaeda support iraq brutal crackdown unislamic,ceasefire georgia putin west,microsoft intel laptop,russogeorgian war balance power,im sense whole georgiarussia war vote,...,bbc asiapacific extinction man,2008,8,12,adorable yearold opening russia georgia operat...,1,0,0,0,3


In [116]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

# Preprocessing helpers
text_imputer = SimpleImputer(strategy='constant', fill_value='') 
num_imputer = SimpleImputer(strategy='mean') 
scaler = StandardScaler()

# Initialize variables
models = {}
text_columns = ['Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7', 'Top8', 'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15', 'Top16', 'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23', 'Top24', 'Top25']  # Textual features
probabilities = []
num_columns = ['Year', 'Month', 'Day', 'Quarter', 'Is_Weekend', 'Month_Start', 'Month_End', 'Day_of_Week']  

# Split the data
X_train, X_val, y_train, y_val = train_test_split(df_train.drop(['Date','Label'], axis=1), df_train['Label'], test_size=0.2, random_state=42)


# Train models for each feature
for col in df_train.columns:

    if col in text_columns:
        # Impute missing values for text columns
        X_train_feature = text_imputer.fit_transform(X_train[[col]])
        X_val_feature = text_imputer.transform(X_val[[col]])
        vectorizer = TfidfVectorizer(max_features=10, stop_words=list(stop_words))
        X_train_transformed = vectorizer.fit_transform(X_train_feature.ravel())
        X_val_transformed = vectorizer.transform(X_val_feature.ravel())
    elif col in num_columns:
        # Convert to numeric, handle missing values, and scale for numerical columns
        X_train_feature = pd.to_numeric(X_train[col], errors='coerce').values.reshape(-1, 1)
        X_val_feature = pd.to_numeric(X_val[col], errors='coerce').values.reshape(-1, 1)
        X_train_transformed = num_imputer.fit_transform(X_train_feature)
        X_val_transformed = num_imputer.transform(X_val_feature)
        X_train_transformed = scaler.fit_transform(X_train_transformed)
        X_val_transformed = scaler.transform(X_val_transformed)
    else:
        continue  # Skip non-numeric, non-text columns

    # Train the model
    #model = LogisticRegression(max_iter=1000, C=1, penalty='l2', solver='lbfgs', random_state=42, n_jobs=-1)
    model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, max_depth=6, max_features='log2', criterion='entropy')
    model.fit(X_train_transformed, y_train)
    models[col] = (model, vectorizer if col in text_columns else None)

    # Store the probabilities for positive class
    probabilities.append(model.predict_proba(X_val_transformed)[:, 1])


# Average the probabilities
average_prob = np.mean(probabilities, axis=0)

# Convert average probabilities to binary predictions
final_predictions = (average_prob > 0.5).astype(int)
print(final_predictions)
# Evaluate the final predictions
final_accuracy = accuracy_score(y_val, final_predictions)
print(f"Final Accuracy on Feature-wise Validation Set: {final_accuracy}") # 0.6346749226006192


[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Final Accuracy on Feature-wise Validation Set: 0.6346749226006192


In [119]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

# Preprocessing helpers
text_imputer = SimpleImputer(strategy='constant', fill_value='') 
num_imputer = SimpleImputer(strategy='mean') 
scaler = StandardScaler()

# Initialize variables
text_columns = ['Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7', 'Top8', 'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15', 'Top16', 'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23', 'Top24', 'Top25']  # Textual features
num_columns = ['Year', 'Month', 'Day', 'Quarter', 'Is_Weekend', 'Month_Start', 'Month_End', 'Day_of_Week']  

# Split the data
X_train, X_val, y_train, y_val = train_test_split(df_train.drop(['Date', 'Label'], axis=1), df_train['Label'], test_size=0.2, random_state=42)

# Process Textual Features
text_features_train = np.hstack([vectorizer.fit_transform(text_imputer.fit_transform(X_train[col].astype(str))).toarray() for col in text_columns])
text_features_val = np.hstack([vectorizer.transform(text_imputer.transform(X_val[col].astype(str))).toarray() for col in text_columns])

# Process Numerical Features
num_features_train = num_imputer.fit_transform(scaler.fit_transform(X_train[num_columns].apply(pd.to_numeric, errors='coerce')))
num_features_val = num_imputer.transform(scaler.transform(X_val[num_columns].apply(pd.to_numeric, errors='coerce')))

# Train Textual Features Model
text_model = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=6, max_features='log2', criterion='entropy')
text_model.fit(text_features_train, y_train)

# Train Numerical Features Model
num_model = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=6, criterion='entropy')
num_model.fit(num_features_train, y_train)

# Combine Predictions
text_probs = text_model.predict_proba(text_features_val)[:, 1]
num_probs = num_model.predict_proba(num_features_val)[:, 1]
average_prob = (text_probs + num_probs) / 2

# Convert average probabilities to binary predictions
final_predictions = (average_prob > 0.5).astype(int)

# Evaluate the final predictions
final_accuracy = accuracy_score(y_val, final_predictions)
print(f"Final Accuracy on Combined Model: {final_accuracy}")


ValueError: Expected 2D array, got 1D array instead:
array=['funeral wwii veteran remembrance day successful online campaign man family'
 'hamas israel body street jose mercury' 'secular church state religion'
 ... 'iran strait hormuz' 'judge boy underage sex case'
 'france radical muslim'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.