In [None]:
pip install catboost



In [None]:
pip install optuna



In [None]:
!pip install dask[dataframe]



In [None]:
import warnings
# Suppress every FutureWarning for the rest of the session so library
# deprecation notices do not clutter the cell outputs below.
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
import pandas as pd

In [None]:
# Read the Kiva training CSV and preview its first rows.
# NOTE(review): the path presumes Google Drive is already mounted at /content/drive.
data = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/Project Datasets/kiva_train.csv')
data.head()

Unnamed: 0,id,country,en,gender,loan_amount,nonpayment,sector,status
0,1,Ecuador,<h4>Business Description</h4> \r\n <p> Don Mau...,M,825,lender,Food,1
1,2,Dominican Republic,Rosa Iris is a brilliant entrepreneur who sell...,F,450,partner,Retail,0
2,3,Kenya,Sirote is married with six children. Two of he...,F,600,lender,Agriculture,1
3,4,Kenya,David Mwangi Kimani is 33 years old and marri...,M,650,lender,Food,1
4,5,Dominican Republic,Nilda is a very persistent woman who has learn...,F,325,partner,Food,0


In [None]:
# Show column dtypes and non-null counts for the training frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5454 entries, 0 to 5453
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           5454 non-null   int64 
 1   country      5454 non-null   object
 2   en           5454 non-null   object
 3   gender       5454 non-null   object
 4   loan_amount  5454 non-null   int64 
 5   nonpayment   5454 non-null   object
 6   sector       5454 non-null   object
 7   status       5454 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 341.0+ KB


In [None]:
# Show summary statistics (count, mean, std, quartiles) for the training frame.
data.describe()

Unnamed: 0,id,loan_amount,status
count,5454.0,5454.0,5454.0
mean,2727.5,718.217822,0.493766
std,1574.578515,587.237614,0.500007
min,1.0,25.0,0.0
25%,1364.25,325.0,0.0
50%,2727.5,600.0,0.0
75%,4090.75,950.0,1.0
max,5454.0,5000.0,1.0


In [None]:
import pandas as pd
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from scipy.sparse import hstack
from textblob import TextBlob

def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    The argument is passed through ``str()`` before analysis, so
    non-string values are scored on their string representation.
    """
    return TextBlob(str(text)).sentiment.polarity

# --- Feature engineering on the training frame (new columns are added to `data`) ---
# Polarity score of each loan description in `en`.
data['text_sentiment'] = data['en'].map(get_sentiment)
# Categorical crosses, stored as "<left>_<right>" strings.
data['country_loan_interaction'] = data['country'].str.cat(data['loan_amount'].astype(str), sep="_")
data['gender_sector_interaction'] = data['gender'].str.cat(data['sector'], sep="_")

# `status` is the target; `id` is removed from the feature frame.
# `en` stays in X because the TF-IDF step reads it from the split frames later.
y = data['status']
X = data.drop(columns=['id', 'status'])


# Column groups routed to the numeric and categorical pipelines.
num_features = ['loan_amount', 'text_sentiment']
cat_features = [
    'country',
    'gender',
    'nonpayment',
    'sector',
    'country_loan_interaction',
    'gender_sector_interaction',
]


# Numeric columns: fill missing values with the column median, then standardise.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])


# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories not seen during fitting are ignored at transform time.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


# TF-IDF over the free-text column, capped at 50 vocabulary terms. It is kept
# outside the ColumnTransformer and applied to `en` separately.
text_transformer = TfidfVectorizer(max_features=50)


# Route each column group to its pipeline.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
])


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Learn the TF-IDF vocabulary from the training descriptions only, then reuse
# it unchanged on the hold-out descriptions.
X_train_text = text_transformer.fit_transform(X_train['en'])
X_test_text = text_transformer.transform(X_test['en'])


# Same pattern for the numeric/categorical preprocessor: fit on train, apply to hold-out.
X_train_main = preprocessor.fit_transform(X_train)
X_test_main = preprocessor.transform(X_test)

# Concatenate tabular and text features column-wise. scipy's hstack returns a
# COO matrix by default, which does not support row indexing; request CSR
# explicitly so cross-validation inside the stacking ensemble can slice rows
# without each consumer converting the matrix again.
X_train_combined = hstack([X_train_main, X_train_text], format='csr')
X_test_combined = hstack([X_test_main, X_test_text], format='csr')


# Base learners. All three share the same tree count, learning rate, depth and
# row-subsampling fraction, and are seeded with 42 (the same value used for the
# train/test split) so repeated runs give the same models.
lightgbm_model = lgb.LGBMClassifier(
    n_estimators=412,
    learning_rate=0.0242,
    max_depth=6,
    subsample=0.6521,
    # LightGBM only applies `subsample` when the bagging frequency is > 0;
    # with the default of 0 the fraction above is silently ignored.
    subsample_freq=1,
    random_state=42,
    # Silence the per-fit [Info] log lines, matching CatBoost's verbose=0.
    verbose=-1,
)
# `use_label_encoder` is no longer accepted by XGBoost (it only produced a
# "Parameters: { "use_label_encoder" } are not used." warning), so it is omitted.
xgb_model = XGBClassifier(
    n_estimators=412,
    learning_rate=0.0242,
    max_depth=6,
    subsample=0.6521,
    eval_metric='logloss',
    random_state=42,
)
# CatBoost names the tree count `iterations` and the depth `depth`;
# verbose=0 suppresses its training log.
catboost_model = CatBoostClassifier(
    iterations=412,
    learning_rate=0.0242,
    depth=6,
    subsample=0.6521,
    random_seed=42,
    verbose=0,
)

# Stacking ensemble: the base learners' 3-fold out-of-fold predictions are the
# inputs to a logistic-regression meta-learner.
stacked_model = StackingClassifier(
    estimators=[
        ('lightgbm', lightgbm_model),
        ('xgboost', xgb_model),
        ('catboost', catboost_model)
    ],
    final_estimator=LogisticRegression(),
    cv=3
)


# Train the stacking ensemble on the combined training matrix.
stacked_model.fit(X_train_combined, y_train)


# Score the hold-out rows: take the second predict_proba column (class 1)
# and measure ROC AUC against the hold-out labels.
y_pred_proba = stacked_model.predict_proba(X_test_combined)[:, 1]
test_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print("Test AUC for Stacked Model with CatBoost: ", test_auc)

# --- Score the competition test set and write the submission file ---
test_data = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/Project Datasets/kiva_test.csv')
test_ids = test_data['id']


# Rebuild the same engineered columns that were added to the training frame.
test_data['text_sentiment'] = test_data['en'].map(get_sentiment)
test_data['country_loan_interaction'] = test_data['country'].str.cat(test_data['loan_amount'].astype(str), sep="_")
test_data['gender_sector_interaction'] = test_data['gender'].str.cat(test_data['sector'], sep="_")


# Reuse the already-fitted preprocessor and TF-IDF vectorizer (transform only,
# no refitting), then join the two feature matrices column-wise.
X_competition_text = text_transformer.transform(test_data['en'])
X_competition_main = preprocessor.transform(test_data.drop(columns=['id', 'en']))
X_competition_combined = hstack([X_competition_main, X_competition_text])


# Class-1 probability for every competition row.
y_competition_proba = stacked_model.predict_proba(X_competition_combined)[:, 1]


# One row per test id, with the predicted probability stored under `status`.
submission = pd.DataFrame({'id': test_ids, 'status': y_competition_proba})

# Write the submission CSV without the DataFrame index column.
submission.to_csv('/content/drive/MyDrive/Project Datasets/submission.csv', index=False)
print("Submission file saved as 'submission.csv'")

[LightGBM] [Info] Number of positive: 2170, number of negative: 2193
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003650 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13128
[LightGBM] [Info] Number of data points in the train set: 4363, number of used features: 137
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497364 -> initscore=-0.010543
[LightGBM] [Info] Start training from score -0.010543


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 1446, number of negative: 1462
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001995 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12965
[LightGBM] [Info] Number of data points in the train set: 2908, number of used features: 125
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497249 -> initscore=-0.011004
[LightGBM] [Info] Start training from score -0.011004
[LightGBM] [Info] Number of positive: 1447, number of negative: 1462
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12951
[LightGBM] [Info] Number of data points in the train set: 2909, number of used features: 124
[LightGBM] [Info] [b

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Test AUC for Stacked Model with CatBoost:  0.9818187326636685
Submission file saved as 'submission.csv'
