In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [None]:
# Load the competition's train and test tables from the Kaggle input directory.
train1 = pd.read_csv('../input/tabular-playground-series-feb-2021/train.csv')
test1 = pd.read_csv('../input/tabular-playground-series-feb-2021/test.csv')
# Preview the first rows of the training table.
train1.head()

In [None]:
# Preview the first rows of the raw test table, for comparison with train.
test1.head()

In [None]:
# Print a summary of the training table (columns, non-null counts, dtypes).
train1.info()

In [None]:
# Count missing values per column of the training table.
print(train1.isnull().sum())

In [None]:
# Separate the training table into the feature matrix X (everything except
# `id` and `target`) and the regression target y.
X = train1.drop(columns=['id', 'target'])
y = train1['target']
X.head()

In [None]:
# Test features: the raw test table without its `id` column.
test = test1.drop(columns=['id'])
test.head()

In [None]:
# Label-encode the categorical columns (taken to be the first 10 feature
# columns) so that every category becomes an integer code.
cols = X.columns[:10]
for col in cols:
    encoder = LabelEncoder()
    # fit_transform() already fits the encoder on the training column, so a
    # separate fit() call beforehand would only repeat the same work.
    X[col] = encoder.fit_transform(X[col])
    # Reuse the mapping learned on train for the test column. Per the
    # scikit-learn docs, transform() raises ValueError on labels it has not seen.
    test[col] = encoder.transform(test[col])

X.head()

In [None]:
# Preview the test features after the categorical columns were label-encoded.
test.head()

In [None]:
# Training table without `id` (target kept); the distribution plots in the
# following cells read from it.
train_df = train1.drop(columns=['id'])

# Correlation heatmap of the feature matrix X, annotated to two decimals.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(
    X.corr(),
    ax=ax,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    cbar_kws={"shrink": 0.8},
)

In [None]:
# I want to thank @masumrumi for sharing this amazing plot!
def plotting_3_chart(df, feature):
    """Draw a histogram, a Q-Q plot and a box plot of one column on one figure.

    Parameters
    ----------
    df : DataFrame holding the column to inspect.
    feature : label of the column in ``df`` to plot.

    Returns
    -------
    None. A new 12x8 figure is created as a side effect, and the global
    matplotlib style is switched to 'fivethirtyeight'.
    """
    ## Importing seaborn, matplotlib and scipy modules.
    import seaborn as sns
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec
    from scipy import stats
    import matplotlib.style as style
    style.use('fivethirtyeight')

    # The same column feeds all three panels.
    values = df.loc[:, feature]

    ## One figure with a 3x3 grid; the panels below span parts of it.
    fig = plt.figure(constrained_layout=True, figsize=(12,8))
    grid = gridspec.GridSpec(ncols=3, nrows=3, figure=fig)

    ## Histogram: first row, first two columns.
    ## NOTE: distplot is deprecated in recent seaborn releases; kept as-is here.
    ax1 = fig.add_subplot(grid[0, :2])
    sns.distplot(values, norm_hist=True, ax = ax1)
    ax1.set_title('Histogram')

    ## Q-Q plot: second row, first two columns.
    ax2 = fig.add_subplot(grid[1, :2])
    stats.probplot(values, plot = ax2)
    ## probplot() writes its own title onto the axes it is given, so our title
    ## has to be set afterwards or it would be overwritten.
    ax2.set_title('QQ_plot')

    ## Box plot: full height of the third column.
    ax3 = fig.add_subplot(grid[:, 2])
    sns.boxplot(values, orient='h', ax = ax3 )
    ax3.set_title('Box Plot')

# Inspect the target column with the three-panel chart.
plotting_3_chart(train_df, 'target')

In [None]:
# Distribution of the target column on a single 10x5 axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.distplot(train_df['target'], ax=ax)
ax.xaxis.grid(True)
ax.set(xlabel="Target", ylabel="Valores", title="Distribuicion de target")
# Trim the spines to the data range and hide the left one.
sns.despine(trim=True, left=True)
plt.show()

In [None]:
# Overlay the train and test distributions of each continuous feature
# (cont0..cont13) on a 4x4 grid so the two sets can be compared.
fig = plt.figure(figsize = (20,20))

for i in range(0,14):
    col = "cont"+str(i)
    ax = fig.add_subplot(4,4,i+1)
    # Draw both curves on this panel explicitly rather than via the current axes.
    sns.distplot(train_df[col], label='train', ax=ax)
    sns.distplot(test[col], label='test', ax=ax)
    plt.legend()
    plt.title(col)

# Lay the grid out once, after every panel exists, instead of on each iteration.
plt.tight_layout()

In [None]:
# Hold out 20% of the training rows for validation; random_state fixes the split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 12)

# Report the shape of each partition.
print(X_train.shape)
print(X_val.shape)

In [None]:
# Hyperparameters for the xgboost model: squared-error regression scored
# with RMSE, depth-5 trees, learning rate 0.05 and a fixed seed.
params = dict(
    objective="reg:squarederror",
    eval_metric="rmse",
    max_depth=5,
    eta=0.05,
    random_state=751,
)

In [None]:
# Wrap the splits in xgboost DMatrix objects; the test matrix carries no labels.
d_train = xgb.DMatrix(X_train, label = y_train)
d_val = xgb.DMatrix(X_val, label = y_val)
d_test = xgb.DMatrix(test)

In [None]:
# Boost for at most 10000 rounds, logging every 10th round and stopping once
# the metric has not improved for 20 rounds. Per the xgboost docs, the last
# entry of `evals` (here "val") is the one monitored for early stopping.
model_1 = xgb.train(
    params=params,
    dtrain=d_train,
    evals=[(d_train, "train"), (d_val, "val")],
    num_boost_round=10000,
    early_stopping_rounds=20,
    verbose_eval=10,
)

In [None]:
# Score the validation split using only the trees up to the best
# early-stopping round, then report the RMSE.
# `ntree_limit` / `best_ntree_limit` no longer exist in xgboost 2.0, so use the
# `iteration_range` equivalent when the old attribute is absent; the original
# call is kept for older versions.
if hasattr(model_1, "best_ntree_limit"):
    predict_1 = model_1.predict(d_val, ntree_limit = model_1.best_ntree_limit)
else:
    # iteration_range is half-open, hence best_iteration + 1.
    predict_1 = model_1.predict(d_val, iteration_range = (0, model_1.best_iteration + 1))
rmse_model_1 = np.sqrt(mean_squared_error(y_val, predict_1))
rmse_model_1

In [None]:
# LightGBM regressor: 500 gradient-boosted trees of up to 128 leaves each at a
# learning rate of 0.005, fitted on the same training split as the xgboost model.
model_2 = lgb.LGBMRegressor(
    boosting_type='gbdt',
    n_estimators=500,
    learning_rate=0.005,
    num_leaves=128,
    min_data_in_leaf=50,
    min_data_per_group=5,
    subsample_for_bin=200000,
    importance_type='split',
    metric='rmse',
    random_state=100,
    verbose=10,
)

model_2.fit(X_train, y_train)

In [None]:
# Predict the validation split with the LightGBM model and display the result.
predict_2 = model_2.predict(X_val)
predict_2

In [None]:
# Validation RMSE of the LightGBM model.
rmse_model_2 = np.sqrt(mean_squared_error(y_val, predict_2))
rmse_model_2

In [None]:
# Predict the encoded test features with the LightGBM model (model_2).
test_predict = model_2.predict(test)
test_predict

In [None]:
# Fill the sample submission's target column with the test predictions and
# write it to submission.csv without the index column.
# NOTE(review): assumes sample_submission.csv rows are in the same order as
# test.csv, since predictions are assigned by position — confirm.
sub = pd.read_csv('../input/tabular-playground-series-feb-2021/sample_submission.csv')
sub['target']=test_predict
sub.to_csv('submission.csv', index=False)

In [None]:
# Submit submission.csv to the competition through the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials in the environment — confirm.
!kaggle competitions submit -c tabular-playground-series-feb-2021 -f submission.csv