In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import gc # сборщик мусора
import math
import matplotlib.pyplot as plt
import matplotlib.image as img
import seaborn as sns
%matplotlib inline

# Reading Data

In [None]:
# Load the competition's training table and preview its first rows.
train = pd.read_csv("../input/tabular-playground-series-dec-2021/train.csv")
train.head()

In [None]:
# Load the competition's test table (the rows to predict) and preview its first rows.
test = pd.read_csv("../input/tabular-playground-series-dec-2021/test.csv")
test.head()

# Reducing memory usage

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and report the saving.

    * Signed / unsigned integer columns become the narrowest int8..int64 /
      uint8..uint64 type whose range contains the column's min and max
      (bounds are inclusive, so a column spanning exactly -128..127 fits int8).
    * Float columns become float32 when their values fit, float64 otherwise.
      float16 is intentionally never chosen.
    * Object columns become ``category``.
    * Every other dtype (bool, category, datetime, pandas extension dtypes)
      is left untouched instead of being pushed through the float branch.

    Args:
        df: DataFrame to optimise. It is modified in place.

    Returns:
        The same DataFrame object, for call-site convenience.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns can be range-checked safely.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            f32 = np.finfo(np.float32)
            # NaN min/max (empty or all-NaN column) fail both comparisons -> float64.
            fits_f32 = f32.min <= c_min and c_max <= f32.max
            df[col] = df[col].astype(np.float32 if fits_f32 else np.float64)
            continue

        candidates = signed_types if col_type.kind == 'i' else unsigned_types
        for candidate in candidates:
            limits = np.iinfo(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Downcast the training frame's column dtypes to shrink its memory footprint.
train = reduce_mem_usage(train)

In [None]:
# Apply the same dtype downcasting to the test frame.
test = reduce_mem_usage(test)

### Data dimension

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

### Missing values

In [None]:
# Total count of missing cells over the whole training frame.
train.isna().sum().sum()

In [None]:
# Total count of missing cells over the whole test frame.
test.isna().sum().sum()

### Class balance

In [None]:
# Number of training rows per Cover_Type class.
train['Cover_Type'].value_counts()

### Removing the least represented class

In [None]:
# Show the Cover_Type entries (with their row labels) of every row belonging to class 5.
train.loc[train['Cover_Type'] == 5, 'Cover_Type']

In [None]:
# Remove class 5 by value instead of by a hard-coded row label: this keeps working
# if the row label differs, removes every class-5 row, and is safe to re-run
# (dropping an already-removed label raised KeyError).
train = train[train['Cover_Type'] != 5]

### Data types

In [None]:
# Print the dtype of every training column, one per line.
for column_dtype in train.dtypes:
    print(column_dtype)

### Correlations

In [None]:
# Heatmap of the pairwise correlations between all columns of `train`.
plt.figure(figsize=(15, 10))
sns.set(font_scale=1.4)

# Round to two decimals, then zero out weak correlations (|r| < 0.1) so that
# only the stronger relationships stand out in the plot.
corr_matrix = train.corr().round(2)
corr_matrix = corr_matrix.mask(corr_matrix.abs() < 0.1, 0)

sns.heatmap(corr_matrix, annot=True, linewidths=.5, cmap='coolwarm')
plt.title('Correlation matrix')
plt.show()

In [None]:
# Correlation of every column with the last column of `train`
# (last column of the correlation matrix, minus its own diagonal entry), strongest first.
full_corr = train.corr()
corr_with_target = full_corr.iloc[:-1, -1].sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x=corr_with_target.values, y=corr_with_target.index, ax=ax)
ax.set_title('Correlation with target variable')
plt.show()

# Training and model selection

In [None]:
# Split the training frame into the target vector and the feature matrix.
y = train['Cover_Type']
X = train.drop(columns=['Cover_Type'])

### Normalization and standardization

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Standardise the features: the scaler learns its statistics from X only and the
# same transformation is then applied to both X and the test frame.
scaler = StandardScaler().fit(X)
X_standar = scaler.transform(X)
test_standar = scaler.transform(test)
print(X_standar)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to the 0-1 range; the range limits are learned from X
# only and then applied to both X and the test frame.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X)
X_norm, test_norm = scaler.transform(X), scaler.transform(test)
print(X_norm)

### Balancing data

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN, KMeansSMOTE

# The sampler is named `smote` rather than `os` so that it does not shadow the
# `os` module imported in the first cell of the notebook.
# sampling_strategy maps each class label to the number of rows wanted after resampling.
smote = SMOTE(random_state=0, sampling_strategy={1:1468136, 2:2262087, 3:500000, 4:50000, 6:50000, 7:100000})
# Alternative samplers; they are constructed here but not applied in this cell.
os1 = ADASYN(random_state=0, n_neighbors=1)
os2 = KMeansSMOTE(random_state=0, k_neighbors=1, sampling_strategy='minority', kmeans_estimator=1)
# feature vector
X_train_full = train.drop(['Cover_Type'], axis=1)
# target variable vector
y_train_full = train['Cover_Type']

column = X_train_full.columns

# Class distribution before resampling.
print("До балансировки")
print(X_train_full.shape)
print(y_train_full.value_counts())

# Let's apply the balancing algorithm
os_data_X, os_data_y = smote.fit_resample(X_train_full, y_train_full)
os_data_X = pd.DataFrame(data=os_data_X, columns=column)
os_data_y = pd.DataFrame(data=os_data_y, columns=['Cover_Type'])

# Class distribution after resampling.
print('_'*100)
print("После балансировки")
print(os_data_X.shape)
print(os_data_y.value_counts())

### Choosing a model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 30% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)  # splits the raw X (X_norm is the alternative)

### Support Vector Machines

In [None]:
from sklearn.svm import SVC

# Baseline: SVC with default hyper-parameters, scored on the hold-out split.
svc = SVC().fit(X_train, y_train)
y_pred_test = svc.predict(X_test)
print(accuracy_score(y_test, y_pred_test))

In [None]:
# Refit the default SVC on all labelled rows and predict the test frame.
svc_f = SVC().fit(X, y)
y_pred = svc_f.predict(test)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 1000 trees, scored on the hold-out split.
clf = RandomForestClassifier(n_estimators=1000).fit(X_train, y_train)
y_pred_test = clf.predict(X_test)
print(accuracy_score(y_test, y_pred_test))

In [None]:
# Refit the 1000-tree random forest on all labelled rows and predict the test frame.
clf_f = RandomForestClassifier(n_estimators=1000).fit(X, y)
y_pred = clf_f.predict(test)

### Extreme Gradient Boosting

In [None]:
import xgboost as xgb

# XGBoost classifier (GPU histogram tree method) fitted on the train split and
# scored on the hold-out split.
# NOTE(review): y_train still carries the original Cover_Type values; some xgboost
# versions expect labels 0..n_classes-1 — confirm against the installed version.
xgb1 = xgb.XGBClassifier(max_depth = 15, min_child_weight = 0.07, learning_rate = 0.01, tree_method='gpu_hist')
xgb1.fit(X_train, y_train)
y_pred_test = xgb1.predict(X_test)

print(accuracy_score(y_test, y_pred_test))

In [None]:
# Same XGBoost configuration refitted on all labelled rows, then used to predict the test frame.
# NOTE(review): y carries the original Cover_Type values — confirm the installed
# xgboost version accepts labels that do not start at 0.
xgb_f = xgb.XGBClassifier(max_depth = 15, min_child_weight = 0.07, learning_rate = 0.01, tree_method='gpu_hist')
xgb_f.fit(X, y)
y_pred = xgb_f.predict(test)

### Catboost

In [None]:
from catboost import CatBoostClassifier

# CatBoost on GPU, fitted on the train split and scored on the hold-out split.
catboost = CatBoostClassifier(
    task_type='GPU',
    n_estimators=40000,
    learning_rate=0.02,
    depth=9,
)
catboost.fit(X_train, y_train)
y_pred_test = catboost.predict(X_test)

print(accuracy_score(y_test, y_pred_test))

In [None]:
# A separately configured model refitted on the full (X, y) data is disabled below;
# the test predictions come from `catboost`, the model fitted on the train split only.
#catboost_f = CatBoostClassifier(n_estimators=15000, learning_rate=0.01, #l2_leaf_reg = 2.5, 
                              #depth = 5)#500 0.1 task_type='GPU', 
#catboost_f.fit(X, y)
y_pred = catboost.predict(test)

### LightGBM

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier (despite the variable name, this is not a logistic
# regression), fitted on the train split and scored on the hold-out split.
logreg = LGBMClassifier(max_depth=11, n_estimators=326, random_state=53)
logreg.fit(X_train, y_train)
y_pred_test = logreg.predict(X_test)

print(accuracy_score(y_test, y_pred_test))

In [None]:
# Same LightGBM configuration refitted on all labelled rows, then used to predict the test frame.
logreg_f = LGBMClassifier(max_depth=11, n_estimators=326, random_state=53)
logreg_f.fit(X, y)
y_pred = logreg_f.predict(test)

### Gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting over depth-1 trees (100 stages, learning rate 1.0),
# fitted on the train split and scored on the hold-out split.
gbc = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
).fit(X_train, y_train)
y_pred_test = gbc.predict(X_test)

print(accuracy_score(y_test, y_pred_test))

In [None]:
# Same gradient-boosting configuration refitted on all labelled rows, then used to predict the test frame.
gbc_f = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
).fit(X, y)
y_pred = gbc_f.predict(test)

### Neural network

In [None]:
from keras import models
from keras import layers
from keras.wrappers.scikit_learn import KerasClassifier

In [None]:
# Inspect the training labels before they are re-encoded for the Keras model.
y_train

In [None]:
# Shift every label down by one (label 1 becomes 0); y_train / y_test themselves are left unchanged.
y_train_n = y_train - 1
y_test_n = y_test - 1
y_train_n

In [None]:
# Distinct shifted label values present in the training split.
y_train_n.unique()

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encode the shifted training labels (one output column per distinct label).
# The input is rebuilt from y_train every time, so re-running the cell does not
# feed the already encoded matrix back into the encoder.
encoder = OneHotEncoder(categories='auto')
y_train_n = encoder.fit_transform(np.array(y_train - 1).reshape((-1, 1))).toarray()

y_train_n

In [None]:
# Encode the hold-out labels with the encoder that was fitted on the training
# labels, so both matrices share the same label-to-column mapping (fitting a
# fresh encoder here could produce a different layout if a class were absent
# from the hold-out split). The input is rebuilt from y_test, so the cell can be re-run.
y_test_n = encoder.transform(np.array(y_test - 1).reshape((-1, 1))).toarray()

y_test_n

In [None]:
# Dense feed-forward classifier: 55 inputs -> 20 -> 256 -> 1024 -> 128 -> 6 outputs.
network = models.Sequential()
network.add(layers.Dense(units=20, activation="relu", input_shape=(55,)))
network.add(layers.Dense(units=256, activation="relu"))
network.add(layers.Dense(units=1024, activation="relu"))
network.add(layers.Dense(units=128, activation="relu"))
# The output layer must produce a probability distribution for
# categorical_crossentropy; a ReLU output is unnormalised and can be all zeros.
network.add(layers.Dense(units=6, activation="softmax"))

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Accuracy is a quantity to maximise, so both callbacks use mode='max'
# (with mode='min' they treat a *drop* in accuracy as an improvement).
# The monitored metric is training accuracy because one of the fit calls
# below runs without validation data.
early_stopping = EarlyStopping(
    monitor='accuracy',
    min_delta=0,
    patience=100,
    verbose=0,
    mode='max',
    baseline=None,
    restore_best_weights=True
)

# Multiply the learning rate by 0.2 after 200 epochs without an accuracy improvement.
reduce_lr = ReduceLROnPlateau(
    monitor='accuracy',
    factor=0.2,
    patience=200,
    mode='max'
)

In [None]:
# Configure training: cross-entropy over one-hot targets, Adam optimizer, accuracy reported per epoch.
network.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

In [None]:
# Train on the train split with one-hot targets, evaluating on the hold-out split after every epoch.
history = network.fit(
    X_train,
    y_train_n,
    epochs=30,
    verbose=1,
    batch_size=100,
    validation_data=(X_test, y_test_n),
        callbacks=[
            early_stopping,
            reduce_lr,
        ])

In [None]:
# Continue training the same network on all labelled rows.
# The targets are one-hot encoded with the existing encoder: the raw labels in
# `y` are a single column of class ids, which does not match the network's
# 6-unit output under categorical_crossentropy.
y_full_n = encoder.transform(np.array(y - 1).reshape((-1, 1))).toarray()

history = network.fit(
    X,
    y_full_n,
    epochs=18,
    verbose=1,
    batch_size=100,
    callbacks=[
        early_stopping,
        reduce_lr,
    ])

In [None]:
# network.predict returns one score per output unit, not a class label.
y_proba = network.predict(test)
# encoder.categories_[0] lists the shifted labels in output-column order; picking
# the best-scoring column and adding 1 restores the original Cover_Type value.
# The result is kept as an (n, 1) column, the shape the submission cells index with [0].
y_pred = (encoder.categories_[0][y_proba.argmax(axis=1)] + 1).reshape(-1, 1)
y_pred

# One more neural network

In [None]:
import torch
import random
import numpy as np

# Seed every random number generator in use (Python, NumPy, PyTorch CPU and CUDA)
# and request deterministic cuDNN kernels.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.backends.cudnn.deterministic = True

In [None]:
# Full feature matrix as a float32 tensor (this replaces the X_train produced by train_test_split).
X_train = torch.from_numpy(X.to_numpy()).float()

In [None]:
# Re-index the labels to a contiguous zero-based range for CrossEntropyLoss:
# labels above 4 move down by one, then everything moves down by one,
# i.e. {1, 2, 3, 4, 6, 7} -> {0, 1, 2, 3, 4, 5}. This assumes class 5 no longer
# occurs in the data (it is removed earlier in the notebook).
# The labels are taken from the full target `y`, matching X_train which was built
# from the full `X`. The previous version used a mask from `train` that was not
# aligned with the split y_train, and `y_train['Cover_Type']` is a label lookup
# on a Series, not a column access.
y_train = y.astype('int64')
y_train = y_train.where(y_train <= 4, y_train - 1) - 1

In [None]:
# Build the target tensor for CrossEntropyLoss from the full label vector `y`.
# astype returns a copy (so `y` is not modified) and yields int64, the integer
# type the loss expects for class-index targets.
labels = y.to_numpy().astype(np.int64)
# Contiguous zero-based class ids: {1, 2, 3, 4, 6, 7} -> {0, 1, 2, 3, 4, 5}
# (assumes class 5 is no longer present in the data).
labels[labels > 4] -= 1
labels -= 1
y_train = torch.from_numpy(labels)

In [None]:
class ClassifierNet(torch.nn.Module):
    """Fully connected classifier: five Linear -> ReLU -> BatchNorm1d stages plus a linear output layer.

    Args:
        n_hidden_neurons: width of the first three hidden layers; the fourth and
            fifth use ``n_hidden_neurons // 2`` and ``n_hidden_neurons // 4``.
        n_features: number of input features per sample. Defaults to 54, the
            value that was previously hard-coded.
        n_classes: number of output logits. Defaults to 7, the value that was
            previously hard-coded.
    """

    def __init__(self, n_hidden_neurons, n_features=54, n_classes=7):
        super().__init__()

        half = n_hidden_neurons // 2
        quarter = n_hidden_neurons // 4

        self.fc1 = torch.nn.Linear(n_features, n_hidden_neurons)
        self.ac1 = torch.nn.ReLU()
        self.batch_norm1 = torch.nn.BatchNorm1d(n_hidden_neurons)

        self.fc2 = torch.nn.Linear(n_hidden_neurons, n_hidden_neurons)
        self.ac2 = torch.nn.ReLU()
        self.batch_norm2 = torch.nn.BatchNorm1d(n_hidden_neurons)

        self.fc3 = torch.nn.Linear(n_hidden_neurons, n_hidden_neurons)
        self.ac3 = torch.nn.ReLU()
        self.batch_norm3 = torch.nn.BatchNorm1d(n_hidden_neurons)

        self.fc4 = torch.nn.Linear(n_hidden_neurons, half)
        self.ac4 = torch.nn.ReLU()
        self.batch_norm4 = torch.nn.BatchNorm1d(half)

        self.fc5 = torch.nn.Linear(half, quarter)
        # Kept so the module's attributes stay as before; forward() does not apply it.
        self.dr = torch.nn.Dropout(p=0.5)
        self.ac5 = torch.nn.ReLU()
        self.batch_norm5 = torch.nn.BatchNorm1d(quarter)

        self.fc6 = torch.nn.Linear(quarter, n_classes)

    def forward(self, x):
        """Return raw class logits of shape (batch, n_classes) for a (batch, n_features) input."""
        stages = (
            (self.fc1, self.ac1, self.batch_norm1),
            (self.fc2, self.ac2, self.batch_norm2),
            (self.fc3, self.ac3, self.batch_norm3),
            (self.fc4, self.ac4, self.batch_norm4),
            (self.fc5, self.ac5, self.batch_norm5),
        )
        for linear, activation, norm in stages:
            x = norm(activation(linear(x)))

        # No softmax here: the logits are meant for a loss that applies it internally.
        return self.fc6(x)

# Instantiate the classifier with 128 neurons in its widest hidden layers.
# NOTE(review): ClassifierNet's first layer takes 54 input features while the Keras
# model earlier in the notebook declares input_shape=(55,) for the same data —
# confirm which of the two matches the number of columns in X.
net = ClassifierNet(128)

In [None]:
# Check whether PyTorch can see a CUDA device.
torch.cuda.is_available()

In [None]:
# Use the first CUDA device when available, otherwise the CPU, and move the model's parameters there.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
net = net.to(device) 

In [None]:
# Cross-entropy over the raw logits, optimised with Adam at a learning rate of 1e-3.
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1.0e-3)

In [None]:
batch_size = 500000

# Make sure BatchNorm uses batch statistics while fitting; this matters when
# the cell is re-run after the model has been switched to inference mode.
net.train()

for epoch in range(1000):
    # Fresh random ordering of the samples for every epoch.
    order = np.random.permutation(len(X_train))
    for start_index in range(0, len(X_train), batch_size):
        optimizer.zero_grad()

        batch_indexes = order[start_index:start_index+batch_size]

        # Only the current mini-batch is moved to the training device.
        X_batch = X_train[batch_indexes].to(device)
        y_batch = y_train[batch_indexes].to(device)

        # Call the module itself rather than .forward() so registered hooks run.
        preds = net(X_batch)

        loss_value = loss(preds, y_batch)
        loss_value.backward()

        optimizer.step()
    if epoch % 100 == 0:
        # Accuracy on the last mini-batch of the epoch (not on the whole data set).
        print(epoch, (preds.argmax(dim=1) == y_batch).float().mean())

In [None]:
# Test features as a float32 tensor placed on the training device.
test_n = torch.from_numpy(test.to_numpy()).float().to(device)

In [None]:
# Inference: switch BatchNorm to its running statistics and skip autograd
# bookkeeping, so predictions do not depend on the composition of the test
# batch and no gradient graph is kept for the whole test set.
net.eval()
with torch.no_grad():
    y_pred = net(test_n)

In [None]:
# Column index of the largest logit in each row, i.e. the predicted class index.
y_pred = y_pred.argmax(dim=1)

In [None]:
# Bring the predictions back to CPU memory.
y_pred = y_pred.cpu()

In [None]:
# Undo the label re-indexing on the *predictions*: shift back to 1-based ids,
# then move everything above 4 up by one to re-open the gap left by class 5
# ({0, 1, 2, 3, 4, 5} -> {1, 2, 3, 4, 6, 7}). The previous version applied the
# second step to y_test, leaving the predictions wrong for classes 6 and 7.
y_pred += 1
y_pred[y_pred > 4] += 1

### Forming an answer

In [None]:
# Row ids for the submission file: consecutive integers 4000000 .. 4999999.
indexs = list(range(4000000, 5000000))

In [None]:
# Convert the predictions to a plain (possibly nested) Python list.
y_pred = y_pred.tolist()

In [None]:
# Inspect the prediction list before building the submission.
y_pred

In [None]:
# Unwrap predictions that arrive as one-element rows (an (n, 1) array becomes a
# list of single-item lists) and leave already-flat predictions untouched.
# Indexing every element with [0] raised TypeError for flat lists of scalars,
# and failed again if the cell was run a second time.
y_pred = [row[0] if isinstance(row, (list, tuple)) else row for row in y_pred]

In [None]:
# Assemble the submission table: one row id and one predicted Cover_Type per test row.
# NOTE(review): confirm the id column's name and casing against the competition's sample submission.
d = {'id': indexs, 'Cover_Type': y_pred}
test_pred = pd.DataFrame(data=d)
test_pred

In [None]:
# Write the submission file to the working directory without the DataFrame index column.
test_pred.to_csv('submission11.csv', index=False)