In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import mean_squared_error, r2_score, make_scorer, accuracy_score, f1_score

import warnings
warnings.filterwarnings('ignore')


In [3]:
df = pd.read_csv(r"C:\Users\Saidabrorkhon\Downloads\Telegram Desktop\train.csv")

In [3]:
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
for col in df.columns:
  if df[col].dtype == 'object':
    df[col].fillna(df[col].mode()[0], inplace=True)
  else:
    df[col].fillna(df[col].mean(), inplace=True)

In [6]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [5]:
cat_col = df.select_dtypes(include='object').columns
le = LabelEncoder()
cardinality = df[cat_col].nunique()

for col in cat_col:
  if cardinality[col] >= 4:
    df[col] = le.fit_transform(df[col])
  else:
    df = pd.get_dummies(df, columns=[col], dtype=int, drop_first=True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    int64  
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    int64  
 8   Fare         891 non-null    float64
 9   Cabin        891 non-null    int64  
 10  Sex_male     891 non-null    int64  
 11  Embarked_Q   891 non-null    int64  
 12  Embarked_S   891 non-null    int64  
dtypes: float64(2), int64(11)
memory usage: 90.6 KB


In [6]:
# keraksiz columnlarni drop qilish
df.drop(columns=['PassengerId'], inplace=True)

In [7]:
x = df.drop(columns=['Survived'])
y = df['Survived']

scaler = StandardScaler()

x_scaled = scaler.fit_transform(x)
x_scaled = pd.DataFrame(x_scaled, columns=x.columns)
df = pd.concat([x_scaled, y.reset_index(drop=True)], axis=1)

In [12]:
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,0.827377,-1.310217,0.737695,-0.592481,0.432793,-0.473674,0.918966,-0.502445,-0.281881,0.585954,0
1,-1.566107,-0.99141,-1.355574,0.638789,0.432793,-0.473674,1.282625,0.786845,1.161545,-1.942303,1
2,0.827377,-0.357685,-1.355574,-0.284663,-0.474545,-0.473674,1.646283,-0.488854,-0.281881,0.585954,1
3,-1.566107,-0.672604,-1.355574,0.407926,0.432793,-0.473674,-1.442322,0.42073,0.057748,0.585954,1
4,0.827377,-1.67179,0.737695,0.407926,-0.474545,-0.473674,0.664904,-0.486337,-0.281881,0.585954,0


In [8]:
x = df.drop(columns=['Survived'])
y = df['Survived']

x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

In [17]:
model = LGBMClassifier()

In [18]:
param_dist={
  'max_depth':[None, 5, 10, 15],
  'min_samples_split':[4,6,8],
  'min_samples_leaf':[2,4,6],
  'max_features':[None,'sqrt','log2']
}

acc_scorer = make_scorer(accuracy_score, greater_is_better=True)
f1_scorer = make_scorer(f1_score)

random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    refit='r2',
    scoring={'mse':acc_scorer,'r2':f1_scorer},
    n_jobs=-1
        )

In [19]:
lgb_model = random_search.fit(x_train, y_train)

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000634 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 723
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838


In [20]:
y_pred = lgb_model.predict(x_val)



In [21]:
acc = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
print(acc)
print(f1)

0.7888888888888889
0.7076923076923077


In [22]:
results = {
  
  'model': ['LightGBM'],
  'mse': [acc],
  'r2': [f1]
}

result = pd.DataFrame(results)
result

Unnamed: 0,model,mse,r2
0,LightGBM,0.788889,0.707692
