In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, matthews_corrcoef

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Read the train and test tables, using their 'id' column as the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('/data/train.csv', '/data/test.csv')
)
# The sample submission keeps 'id' as an ordinary column.
sample_df = pd.read_csv('/data/sample_submission.csv')

In [None]:
# Summarise the training frame: column names, dtypes and memory footprint.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3116945 entries, 0 to 3116944
Data columns (total 21 columns):
 #   Column                Dtype  
---  ------                -----  
 0   class                 object 
 1   cap-diameter          float64
 2   cap-shape             object 
 3   cap-surface           object 
 4   cap-color             object 
 5   does-bruise-or-bleed  object 
 6   gill-attachment       object 
 7   gill-spacing          object 
 8   gill-color            object 
 9   stem-height           float64
 10  stem-width            float64
 11  stem-root             object 
 12  stem-surface          object 
 13  stem-color            object 
 14  veil-type             object 
 15  veil-color            object 
 16  has-ring              object 
 17  ring-type             object 
 18  spore-print-color     object 
 19  habitat               object 
 20  season                object 
dtypes: float64(3), object(18)
memory usage: 523.2+ MB


In [None]:
# Preview the first rows of the raw training data.
train_df.head()

Unnamed: 0_level_0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,e,8.8,f,s,u,f,a,c,w,4.51,...,,,w,,,f,f,,d,a
1,p,4.51,x,h,o,f,a,c,n,4.79,...,,y,o,,,t,z,,d,w
2,e,6.94,f,s,b,f,x,c,w,6.85,...,,s,n,,,f,f,,l,w
3,e,3.88,f,y,g,f,s,,g,4.16,...,,,w,,,f,f,,d,u
4,e,5.85,x,l,w,f,d,,w,3.37,...,,,w,,,f,f,,g,a


In [None]:
# Count missing values in each training column.
train_df.isna().sum()

class                         0
cap-diameter                  4
cap-shape                    40
cap-surface              671023
cap-color                    12
does-bruise-or-bleed          8
gill-attachment          523936
gill-spacing            1258435
gill-color                   57
stem-height                   0
stem-width                    0
stem-root               2757023
stem-surface            1980861
stem-color                   38
veil-type               2957493
veil-color              2740947
has-ring                     24
ring-type                128880
spore-print-color       2849682
habitat                      45
season                        0
dtype: int64

In [None]:
# Summarise the test frame: column names, dtypes and memory footprint.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2077964 entries, 3116945 to 5194908
Data columns (total 20 columns):
 #   Column                Dtype  
---  ------                -----  
 0   cap-diameter          float64
 1   cap-shape             object 
 2   cap-surface           object 
 3   cap-color             object 
 4   does-bruise-or-bleed  object 
 5   gill-attachment       object 
 6   gill-spacing          object 
 7   gill-color            object 
 8   stem-height           float64
 9   stem-width            float64
 10  stem-root             object 
 11  stem-surface          object 
 12  stem-color            object 
 13  veil-type             object 
 14  veil-color            object 
 15  has-ring              object 
 16  ring-type             object 
 17  spore-print-color     object 
 18  habitat               object 
 19  season                object 
dtypes: float64(3), object(17)
memory usage: 332.9+ MB


In [None]:
# Count missing values in each test column.
test_df.isna().sum()

cap-diameter                  7
cap-shape                    31
cap-surface              446904
cap-color                    13
does-bruise-or-bleed         10
gill-attachment          349821
gill-spacing             839595
gill-color                   49
stem-height                   1
stem-width                    0
stem-root               1838012
stem-surface            1321488
stem-color                   21
veil-type               1971545
veil-color              1826124
has-ring                     19
ring-type                 86195
spore-print-color       1899617
habitat                      25
season                        0
dtype: int64

In [None]:
# Share of missing values per training column, as a percentage
# (the mean of the boolean null mask is the missing fraction).
missing_percentage = train_df.isnull().mean() * 100
print(missing_percentage)

class                    0.000000
cap-diameter             0.000128
cap-shape                0.001283
cap-surface             21.528227
cap-color                0.000385
does-bruise-or-bleed     0.000257
gill-attachment         16.809280
gill-spacing            40.373988
gill-color               0.001829
stem-height              0.000000
stem-width               0.000000
stem-root               88.452732
stem-surface            63.551362
stem-color               0.001219
veil-type               94.884350
veil-color              87.936970
has-ring                 0.000770
ring-type                4.134818
spore-print-color       91.425482
habitat                  0.001444
season                   0.000000
dtype: float64


In [None]:
# Work on an independent copy so the raw training frame stays untouched.
copy_train = train_df.copy(deep=True)

In [None]:
# Object-dtype columns of the raw training frame (this includes the 'class' target).
categorical_cols = train_df.select_dtypes(include='object').columns
# Replace missing categorical values with an explicit 'unknown' category;
# columns not named in the mapping are left as they are.
copy_train = copy_train.fillna(dict.fromkeys(categorical_cols, 'unknown'))

In [None]:
# Re-check missing counts after filling the categorical columns.
copy_train.isna().sum()

class                   0
cap-diameter            4
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-height             0
stem-width              0
stem-root               0
stem-surface            0
stem-color              0
veil-type               0
veil-color              0
has-ring                0
ring-type               0
spore-print-color       0
habitat                 0
season                  0
dtype: int64

In [None]:
# Non-object (numeric) columns of the raw training frame.
numerical_cols = train_df.select_dtypes(exclude=['object']).columns
# Most frequent value of each numeric column (first mode when several tie).
numeric_fill_values = {
    column: copy_train[column].mode()[0] for column in numerical_cols
}
# Fill the remaining numeric gaps with those per-column modes.
copy_train[numerical_cols] = copy_train[numerical_cols].fillna(numeric_fill_values)

In [None]:
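# Disabled alternative (kept for reference): drop the eight columns with the highest share of missing values instead of imputing them.
# copy_train = copy_train.drop(['spore-print-color', 'veil-color', 'stem-root', 'stem-surface', 'veil-type', 'gill-spacing', 'cap-surface', 'gill-attachment'], axis = 1)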

In [None]:
# Percentage of missing values per column after imputation.
copy_train.isna().mean() * 100

class                   0.0
cap-diameter            0.0
cap-shape               0.0
cap-surface             0.0
cap-color               0.0
does-bruise-or-bleed    0.0
gill-attachment         0.0
gill-spacing            0.0
gill-color              0.0
stem-height             0.0
stem-width              0.0
stem-root               0.0
stem-surface            0.0
stem-color              0.0
veil-type               0.0
veil-color              0.0
has-ring                0.0
ring-type               0.0
spore-print-color       0.0
habitat                 0.0
season                  0.0
dtype: float64

In [None]:
# Preview the imputed training data (missing categoricals now read 'unknown').
copy_train.head()

Unnamed: 0_level_0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,e,8.8,f,s,u,f,a,c,w,4.51,...,unknown,unknown,w,unknown,unknown,f,f,unknown,d,a
1,p,4.51,x,h,o,f,a,c,n,4.79,...,unknown,y,o,unknown,unknown,t,z,unknown,d,w
2,e,6.94,f,s,b,f,x,c,w,6.85,...,unknown,s,n,unknown,unknown,f,f,unknown,l,w
3,e,3.88,f,y,g,f,s,unknown,g,4.16,...,unknown,unknown,w,unknown,unknown,f,f,unknown,d,u
4,e,5.85,x,l,w,f,d,unknown,w,3.37,...,unknown,unknown,w,unknown,unknown,f,f,unknown,g,a


In [None]:
# Hold out 20% of the imputed training rows for validation, with a fixed seed
# for a reproducible split; the 'class' column is the prediction target.
X_train, X_val, y_train, y_val = train_test_split(
    copy_train.drop(columns=['class']),
    copy_train['class'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Object-dtype feature columns; 'class' was dropped from X_train, so from here
# on this list covers feature columns only.
categorical_cols = X_train.select_dtypes(include='object').columns

# Categories not seen during fit are encoded as -1 instead of raising.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Training split: learn the category -> number mapping and apply it.
X_train_cat_encoded = encoder.fit_transform(X_train[categorical_cols].astype(str))
X_train_encoded = X_train.copy()
# Whole-column assignment replaces the object columns with the encoded values.
X_train_encoded[categorical_cols] = X_train_cat_encoded

# Validation split: reuse the mapping fitted on the training split.
X_val_cat_encoded = encoder.transform(X_val[categorical_cols].astype(str))
X_val_encoded = X_val.copy()
X_val_encoded[categorical_cols] = X_val_cat_encoded


In [None]:
# Fit the label mapping on the training targets only, then apply the same
# mapping to both the training and validation targets.
le = LabelEncoder().fit(y_train)
y_train_encoded = le.transform(y_train)
y_val_encoded = le.transform(y_val)

In [None]:
# Preview the training features after ordinal encoding.
X_train_encoded.head()

Unnamed: 0_level_0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1252551,2.58,58.0,71.0,52.0,19.0,40.0,24.0,41.0,2.87,5.69,32.0,49.0,41.0,20.0,20.0,4.0,16.0,26.0,23.0,0.0
1799166,1.83,41.0,46.0,53.0,8.0,66.0,24.0,41.0,5.36,2.7,32.0,49.0,41.0,20.0,20.0,4.0,16.0,26.0,34.0,0.0
1936146,5.22,63.0,50.0,53.0,8.0,44.0,40.0,57.0,7.32,7.41,32.0,49.0,38.0,20.0,20.0,4.0,16.0,26.0,34.0,0.0
1464811,4.52,45.0,69.0,65.0,8.0,62.0,26.0,57.0,5.85,12.74,32.0,36.0,53.0,20.0,20.0,4.0,16.0,26.0,23.0,0.0
767639,6.18,45.0,69.0,52.0,8.0,40.0,24.0,57.0,6.33,10.29,13.0,49.0,41.0,20.0,20.0,4.0,16.0,26.0,33.0,3.0


In [None]:
# Inspect the integer-encoded training labels.
y_train_encoded

array([1, 1, 1, ..., 1, 1, 0])

In [None]:
# Hyperparameters for the XGBoost classifier, with a fixed seed.
xgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 5,
    'random_state': 42,
}
model = XGBClassifier(**xgb_params)

# Train on the encoded training split.
model.fit(X_train_encoded, y_train_encoded)

In [None]:
# Predict encoded class labels for the held-out validation split.
y_val_pred = model.predict(X_val_encoded)

In [None]:
# Matthews correlation coefficient of the validation predictions.
matthews_corrcoef(y_true=y_val_encoded, y_pred=y_val_pred)

0.9579167148263661

In [None]:
# Work on an independent copy so the raw test frame stays untouched.
X_test = test_df.copy(deep=True)

In [None]:
# Impute the test features with the same rules used for the training data, so
# that train, validation and test all go through identical preprocessing.
X_test[categorical_cols] = X_test[categorical_cols].fillna('unknown')
for col in numerical_cols:
    # Reuse the mode of the *training* column (the value the training frame
    # was filled with) rather than recomputing a mode on the test set; a
    # test-derived fill value would make the test features inconsistent with
    # the data the model was fitted on.
    train_mode_value = train_df[col].mode()[0]
    X_test[col] = X_test[col].fillna(train_mode_value)

In [None]:
# Confirm that no missing values remain in the test features.
X_test.isna().sum()

cap-diameter            0
cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-height             0
stem-width              0
stem-root               0
stem-surface            0
stem-color              0
veil-type               0
veil-color              0
has-ring                0
ring-type               0
spore-print-color       0
habitat                 0
season                  0
dtype: int64

In [None]:
# Encode the test categoricals with the encoder fitted on the training split.
# Cast to str first, exactly as was done for the training and validation
# splits: the encoder learned string categories, so any non-string object
# value would not match them and would be encoded as the unknown code (-1).
X_test_cat_encoded = encoder.transform(X_test[categorical_cols].astype(str))
X_test_encoded = X_test.copy()
# Whole-column assignment replaces the object columns with the encoded values.
X_test_encoded[categorical_cols] = X_test_cat_encoded

In [None]:
# Preview the test features after ordinal encoding.
X_test_encoded.head()

Unnamed: 0_level_0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3116945,8.64,63.0,69.0,52.0,19.0,66.0,40.0,55.0,11.13,17.12,13.0,49.0,51.0,19.0,21.0,16.0,17.0,26.0,23.0,0.0
3116946,6.9,55.0,67.0,53.0,8.0,66.0,24.0,57.0,1.27,10.75,32.0,49.0,41.0,20.0,20.0,4.0,16.0,26.0,23.0,0.0
3116947,2.0,41.0,46.0,52.0,8.0,66.0,24.0,41.0,6.18,3.14,32.0,49.0,41.0,20.0,20.0,4.0,16.0,26.0,23.0,1.0
3116948,3.47,63.0,67.0,52.0,8.0,62.0,24.0,41.0,4.98,8.51,32.0,49.0,51.0,20.0,13.0,16.0,38.0,26.0,23.0,2.0
3116949,6.17,63.0,47.0,65.0,8.0,58.0,40.0,57.0,6.73,13.7,32.0,49.0,53.0,20.0,22.0,16.0,34.0,26.0,23.0,2.0


In [None]:
# Predict encoded class labels for the test set and display them.
result = model.predict(X_test_encoded)
result

array([0, 1, 1, ..., 1, 0, 0])

In [None]:
# Map the integer predictions back to the original class labels.
decoded_result = le.inverse_transform(result)
decoded_result

array(['e', 'p', 'p', ..., 'p', 'e', 'e'], dtype=object)

In [None]:
# Check the sample submission's columns, dtypes and row count.
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2077964 entries, 0 to 2077963
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   id      int64 
 1   class   object
dtypes: int64(1), object(1)
memory usage: 31.7+ MB


In [None]:
# Attach predictions to the submission rows by id rather than by position, so
# a different row order in the sample submission cannot silently mislabel rows.
predictions_by_id = pd.Series(decoded_result, index=X_test_encoded.index)
sample_df['class'] = sample_df['id'].map(predictions_by_id)
# Every submission id must have received a prediction.
assert sample_df['class'].notna().all(), "some submission ids have no prediction"

In [None]:
# Write the submission file without the DataFrame's positional index.
sample_df.to_csv('submission.csv',index=False)