In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, roc_auc_score

pd.set_option('display.max_columns', None)

In [2]:
#load data here
train = pd.read_csv('train_text_emb.csv')
val = pd.read_csv('val_text_emb.csv')
test = pd.read_csv('test_text_emb.csv')

In [3]:
train = train.drop(['tweet_text'],axis=1)
val = val.drop(['tweet_text'],axis=1)
test = test.drop(['tweet_text'],axis=1)

In [4]:
# train = pd.concat([train,val])

In [5]:
train.shape[0], test.shape[0]

(58954, 10000)

In [6]:
label_ = 'label'

X_train = train.drop(label_, axis=1)
y_train = train[label_]

X_test = test.drop(label_, axis=1)
y_test = test[label_]

In [7]:
X_train.shape, X_test.shape

In [8]:
X_train

In [9]:
X_test

In [20]:
# train_ori = pd.read_csv('train_text.csv')

# train_ori['id'] = train_ori['id'].astype(str)

# train_ori = train_ori[['id']]

# train_combine_id = pd.concat([train_ori, train], axis=1)

# train_combine_id.to_csv('train_emb_.csv', index=False)

In [12]:
# test_ori = pd.read_csv('test_text.csv')

# test_ori['id'] = test_ori['id'].astype(str)

# test_ori = test_ori[['id']]

# test_combine_id = pd.concat([test_ori, test], axis=1)

# test_combine_id.to_csv('test_emb_.csv', index=False)

In [10]:
# Define the parameter grid for grid search
# param_grid = {
#     'iterations': [100, 200, 300, 400, 500, 1000],
#     'depth': [3,4,5,6,7,8,9,10], 
#     'learning_rate': [0.01, 0.1, 0.2]
# }

param_grid = {
    'iterations': [100, 300, 500],
    'depth': [3,5,7], 
    #'learning_rate': [0.1, 0.2]
}

# Create a CatBoostClassifier
catboost_classifier = CatBoostClassifier(random_seed=42, verbose=0) #cat_features=cat_feat

# Create GridSearchCV
grid_search = GridSearchCV(catboost_classifier, param_grid, cv=5, scoring='roc_auc')

# Fit the model with grid search
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

Best Parameters: {'depth': 5, 'iterations': 400, 'learning_rate': 0.1}


In [11]:
# Create a CatBoostClassifier
catboost_model = CatBoostClassifier(random_seed=42, 
                                    verbose=0, 
                                    iterations = best_params['iterations'],
                                    depth = best_params['depth'], 
                                    #learning_rate = best_params['learning_rate']
                                    ) #cat_features=cat_feat
# Train the model
catboost_model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x16f044590>

In [12]:
#Performance Report Train Set
y_pred = catboost_model.predict(X_train)
y_prob = catboost_model.predict_proba(X_train)

auc_train = roc_auc_score(y_train, y_prob[:,1])
accuracy_train = accuracy_score(y_train, y_pred)
precision = precision_score(y_train, y_pred)
recall = recall_score(y_train, y_pred)

print(f'AUC for train set: {auc_train}')
print(f'Accuracy for train set: {accuracy_train}')
print(f'Precision for train set: {precision}')
print(f'Recall for train set: {recall}')

AUC for train set: 0.7878063669165362
Accuracy for train set: 0.7216436813960033
Precision for train set: 0.7321106926477813
Recall for train set: 0.6990962254120149


In [13]:
#Performance Report Test Set
y_pred = catboost_model.predict(X_test)
y_prob = catboost_model.predict_proba(X_test)

auc_test = roc_auc_score(y_test, y_prob[:,1])
accuracy_test = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f'AUC for test set: {auc_test}')
print(f'Accuracy for test set: {accuracy_test}')
print(f'Precision for test set: {precision}')
print(f'Recall for test set: {recall}')

AUC for test set: 0.6933409877336396
Accuracy for test set: 0.6476
Precision for test set: 0.6516115787312666
Recall for test set: 0.6346730653869226


In [14]:
# Save the model as a pickle file
with open('result/catboost_model_local.pkl', 'wb') as f:
    pickle.dump(catboost_model, f)

In [10]:
#add label
X_test['label'] = y_test

#add final prediction from the model
X_test['final_pred'] = y_prob[:,1]

#add ids
X_test['id'] = test['id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['label'] = y_test


In [11]:
X_test

Unnamed: 0,text_prediction,image_prediction,label,final_pred,id
1,0.268957,0.503504,0,0.330705,1043075096544509952
2,0.888668,0.503504,0,0.748679,1115669983680454656
4,0.905938,0.503504,1,0.754194,1036092005741256705
5,0.304297,0.503504,1,0.345691,1037147555367280640
6,0.814584,0.503504,1,0.709645,1058255262727827456
...,...,...,...,...,...
9992,0.812633,0.503504,1,0.708393,1056325053862871040
9994,0.242986,0.503504,0,0.327795,1108959275609546754
9995,0.255713,0.503504,0,0.328791,1109972327897460736
9997,0.344334,0.503504,0,0.365967,1114945824851730434
