In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.impute import SimpleImputer
from sklearn.semi_supervised import SelfTrainingClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings("ignore")

In [12]:
train = pd.read_csv("train.csv")
labels = pd.read_csv("train_labels.csv")
test = pd.read_csv("test.csv")
train.head()

Unnamed: 0,Id,gene_1,gene_3,gene_5,gene_7,gene_8,gene_9,gene_10,gene_11,gene_13,...,gene_20634,gene_20635,gene_20636,gene_20637,gene_20638,gene_20639,gene_20640,gene_20641,gene_20642,Class
0,sample_664,0.160738,-0.327348,-0.144638,0.196493,-1.105093,0.309926,-0.177461,-1.124182,-0.459826,...,-1.611378,-1.108411,-0.670719,-1.739299,0.476467,1.136071,-0.576601,-1.275518,-0.508678,1.0
1,sample_215,-0.771173,0.885819,-0.234209,0.273139,0.132208,-0.249541,0.005817,-0.631647,,...,0.247812,0.144035,0.148776,-1.373208,0.099245,0.391993,0.573363,0.322198,6.022439,0.0
2,sample_343,-0.169258,1.908618,0.165008,-0.562826,0.19972,0.128036,2.34845,2.425346,-0.933545,...,1.133065,0.965014,1.873753,-0.005167,-0.223091,0.782868,-0.562787,-0.471593,-0.763284,4.0
3,sample_707,-0.947912,0.111177,-0.153179,0.837412,0.185467,-0.066223,-0.267734,0.674365,-0.076086,...,0.022339,0.326506,-0.333964,0.228595,-0.245309,0.478564,0.273364,1.756369,-0.2662,1.0
4,sample_621,-0.335741,0.515251,0.32544,-0.842387,-0.500415,0.48424,-0.438587,-0.874562,,...,-1.516812,-1.430622,-0.664933,-0.75341,,0.375521,-0.536705,-0.52385,0.22256,4.0


In [13]:
labels.head()

Unnamed: 0,Id,Class
0,sample_664,1
1,sample_215,0
2,sample_343,4
3,sample_707,1
4,sample_621,4


In [14]:
train = train.merge(labels, on="Id", how="left") #merging id class in train and labels dataset
train.head()

Unnamed: 0,Id,gene_1,gene_3,gene_5,gene_7,gene_8,gene_9,gene_10,gene_11,gene_13,...,gene_20635,gene_20636,gene_20637,gene_20638,gene_20639,gene_20640,gene_20641,gene_20642,Class_x,Class_y
0,sample_664,0.160738,-0.327348,-0.144638,0.196493,-1.105093,0.309926,-0.177461,-1.124182,-0.459826,...,-1.108411,-0.670719,-1.739299,0.476467,1.136071,-0.576601,-1.275518,-0.508678,1.0,1.0
1,sample_215,-0.771173,0.885819,-0.234209,0.273139,0.132208,-0.249541,0.005817,-0.631647,,...,0.144035,0.148776,-1.373208,0.099245,0.391993,0.573363,0.322198,6.022439,0.0,0.0
2,sample_343,-0.169258,1.908618,0.165008,-0.562826,0.19972,0.128036,2.34845,2.425346,-0.933545,...,0.965014,1.873753,-0.005167,-0.223091,0.782868,-0.562787,-0.471593,-0.763284,4.0,4.0
3,sample_707,-0.947912,0.111177,-0.153179,0.837412,0.185467,-0.066223,-0.267734,0.674365,-0.076086,...,0.326506,-0.333964,0.228595,-0.245309,0.478564,0.273364,1.756369,-0.2662,1.0,1.0
4,sample_621,-0.335741,0.515251,0.32544,-0.842387,-0.500415,0.48424,-0.438587,-0.874562,,...,-1.430622,-0.664933,-0.75341,,0.375521,-0.536705,-0.52385,0.22256,4.0,4.0


In [15]:
 #drop the duplicate column class in train dataset

train = train.drop(columns=["Class_x"], errors='ignore')

train = train.rename(columns={"Class_y": "Class"})

In [16]:
gene_features = [col for col in train.columns if col.startswith("gene_")] #select gene feature columns

In [17]:
#splitting dataset into labelled and unlabelled

labeled = train[train["Class"].notna()]
unlabeled = train[train["Class"].isna()]

X_labeled = labeled[gene_features].values
y_labeled = labeled["Class"].astype(int).values

X_unlabeled = unlabeled[gene_features].values
y_unlabeled = -1 * np.ones(X_unlabeled.shape[0])

In [18]:
X_test = test[gene_features].values

#for missing values used simple imputer using mean
imputer = SimpleImputer(strategy="mean")
X_combined = np.vstack([X_labeled, X_unlabeled])
x_combinedImputed = imputer.fit_transform(X_combined) #imputed and labeled gene features
X_testImputed = imputer.transform(X_test)

In [19]:
#scale features
scaler = StandardScaler()
X_combinedScaled = scaler.fit_transform(x_combinedImputed) #scaled
X_testScaled = scaler.transform(X_testImputed)

In [20]:
#feature selection from all avl gene data (selecting 1000)

selector = SelectKBest(score_func=mutual_info_classif, k=1000)
y_combined = np.concatenate([y_labeled, y_unlabeled])

In [21]:
X_combinedSelected = selector.fit_transform(X_combinedScaled, y_combined) #selected
X_testSelected = selector.transform(X_testScaled)

In [22]:
#self training classifier with lightbgm

base = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=12,
    num_leaves=64,
    random_state=42,
    class_weight='balanced'
)

self_train = SelfTrainingClassifier(
    base_estimator=base,
    threshold=0.92,
    verbose=True
)

In [23]:
self_train.fit(X_combinedSelected, y_combined) #train on data

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010549 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48063
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 1000
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438


In [24]:
y_pred = self_train.predict(X_testSelected)

submission = pd.read_csv("sample_submission.csv")
submission["Class"] = y_pred.astype(int)
submission.to_csv("submission2.csv", index=False)
