# LightGBM in Action 

We will train a LightGBM model and use it to generate predictions to submit to the [Tabular Playground Series - Mar 2021 Competition](https://www.kaggle.com/c/tabular-playground-series-mar-2021). We use LightGBM because of its advantages, such as speed and accuracy when working with large datasets.

In [None]:
# Import packages
import numpy as np # Handling matrices
import pandas as pd # Data processing
import matplotlib.pyplot as plt # Plotting
import seaborn as sns # Plotting 
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder # Handling categorical data and normalization
from sklearn.model_selection import train_test_split # Split data in train and test
from sklearn.metrics import roc_auc_score,precision_score,confusion_matrix, accuracy_score, roc_curve, f1_score # Several useful metrics
import lightgbm as lgb # LightGBM

# Set matplotlib configuration
%matplotlib inline
plt.style.use('seaborn')

# 1) Review and analysis of data

In [None]:
# Load the competition training set and report its dimensions
data = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
print("This dataset contains: {} rows and {} columns".format(*data.shape))
# Preview the first rows (rendered as the cell output)
data.head()

In [None]:
# Review the dtype pandas inferred for each column
data.dtypes

In [None]:
# Count how many columns there are of each dtype and show the table
dtype_counts = data.dtypes.value_counts()
print(dtype_counts)

# Count the columns by dtype rather than by position in the value_counts
# result: positional access depends on how the dtypes happen to be ordered
# and is deprecated for label-indexed Series in recent pandas versions.
n_categorical = data.select_dtypes(include='object').shape[1]
n_numerical = data.select_dtypes(include='float64').shape[1]
print('This dataset contains {} categorical features'.format(n_categorical))
print('This dataset contains {} numerical features'.format(n_numerical))

# id and target are the only integer columns

In [None]:
# Count the missing values in each column
data.isna().sum()

# According to the counts above, the dataset has no missing values

In [None]:
# Boolean mask over the columns: True where the column holds object (string) data
cat = data.dtypes == 'object'
# Names of the categorical columns, in their original column order
cat_cols = [col_name for col_name, is_object in cat.items() if is_object]
print(cat_cols)

# One count plot per categorical column, each drawn on its own figure
for col_name in cat_cols:
    plt.figure(figsize=(8, 4))
    sns.countplot(x=data[col_name])

In [None]:
# Collect the float64 columns; these are treated as the numerical features
numerical_cols = data.select_dtypes(include=['float64']).columns.tolist()

# Draw one histogram per numerical feature to inspect its distribution
data[numerical_cols].hist(bins=15, figsize=(20, 14), layout=(7, 3));

We can see that our categorical and numerical features behave differently. Some categorical features have only a few classes while others have many, and the numerical features follow different distributions.

In [None]:
# Analyse our target column
data['target'].hist(bins=15, figsize=(12,6));

# We observe that our data is imbalanced. This is an important point: it is
# why the split below is stratified and the model is told the classes are unbalanced.

# 2) Create a model

In [None]:
# Predictors (X) are every column except id and target; the label (y) is target
X = data.drop(columns=['id', 'target'])
y = data['target']

In [None]:
# LightGBM can handle categorical data directly, so we rely on its built-in
# support instead of encoding the columns ourselves.

# Cast every object/category column to pandas' 'category' dtype, which is the
# type LightGBM expects for categorical features
for col_name in X.select_dtypes(include=['object', 'category']).columns:
    X[col_name] = X[col_name].astype('category')

In [None]:
# Hold out 20% of the rows for validation. stratify=y keeps the proportion of
# the target classes the same in the training and validation subsets.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=123,
    stratify=y,
)

In [None]:
# Percentage of each class in the full target column
overall_pct = 100. * y.value_counts() / len(y)
print("Proportion of classes in entire data: ")
print(overall_pct, "\n")

# Same percentages for the training subset ...
train_pct = 100. * y_train.value_counts() / len(y_train)
print("Proportion of classes in train data: ")
print(train_pct, "\n")

# ... and for the validation subset; the three tables should match closely
valid_pct = 100. * y_valid.value_counts() / len(y_valid)
print("Proportion of classes in valid data: ")
print(valid_pct)

In [None]:
# Create the model
# Wrap the training split in LightGBM's Dataset container
d_train = lgb.Dataset(X_train, label=y_train)

# Training parameters. Every value must be a plain scalar: a trailing comma
# after a value would silently turn it into a 1-tuple such as (500,).
params = {
    'learning_rate': 0.05,    # Learning rate
    'boosting_type': 'gbdt',  # Gradient Boosting Decision Tree
    'objective': 'binary',    # Binary target feature
    'metric': 'auc',          # Metric for binary classification
    'max_depth': 500,         # Depth limit; tree size is bounded by num_leaves in practice
    # NOTE(review): the LightGBM docs say bagging is only enabled when
    # bagging_freq is non-zero; it is not set here, so this value is
    # presumably inert - confirm whether bagging was intended.
    'bagging_fraction': 0.6,
    'force_row_wise': True,   # Force row-wise histogram building
    'is_unbalance': True,     # Tell LightGBM the classes are unbalanced ('unbalance' is an alias)
    'num_leaves': 100,        # Maximum number of leaves per tree
}

# Train the model for 200 boosting rounds
clf = lgb.train(params, d_train, num_boost_round=200)

# Prediction on the valid set (thresholded at 0.5 further down)
y_pred = clf.predict(X_valid)

In [None]:
# Helper that draws a ROC curve on the current axes
def plot_roc_curve(fpr, tpr):
    """Plot a ROC curve and display the figure.

    Args:
        fpr: False positive rates, used as the x values of the curve.
        tpr: True positive rates, used as the y values of the curve.

    The curve is drawn in orange together with a dashed dark-blue diagonal
    from (0, 0) to (1, 1) for reference.
    """
    ax = plt.gca()
    ax.plot(fpr, tpr, color='orange', label='ROC')
    # Reference diagonal
    ax.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax.legend()
    plt.show()

# Compute the ROC curve points from the validation labels and the model
# predictions, then plot them (thresholds is returned but not used here)
fpr, tpr, thresholds = roc_curve(y_valid, y_pred)
plot_roc_curve(fpr, tpr)

In [None]:
# Turn the predictions into 0/1 class labels with a 0.5 cut-off and build
# the confusion matrix against the validation labels
pred_class = (y_pred > 0.5).astype(int)
cm = confusion_matrix(y_valid, pred_class)
print("Confusion matrix: \n", cm, "\n")

# Accuracy, rounded to four decimals
accuracy = accuracy_score(y_valid, pred_class)
accuracy = round(accuracy, 4)
print("Accuracy: {}".format(accuracy), "\n")

# F1 score (it is required by Task 1 of this dataset)
f1 = f1_score(y_valid, pred_class)
print("F1: {}".format(f1), "\n")


In [None]:
# Pair each feature name with its importance value and sort from most to
# least important
importance_feature = pd.DataFrame(
    {'Value': clf.feature_importance(), 'Feature': clf.feature_name()}
)
importance_feature = importance_feature.sort_values(by="Value", ascending=False)

# Horizontal bar chart: one bar per feature, most important at the top
plt.figure(figsize=(20, 10))
sns.barplot(x='Value', y='Feature', data=importance_feature)
plt.title("Importance feature");

# 3) Use model in test set

In [None]:
# Read the competition test set
test = pd.read_csv("../input/tabular-playground-series-mar-2021/test.csv")
# Peek at the first rows (not the last expression of the cell, so it is not rendered)
test.head()

# Predictors are every column except id
X_test = test.drop(columns="id")

# Cast object/category columns to 'category', mirroring what was done for
# the training features
for col_name in X_test.select_dtypes(include=['object', 'category']).columns:
    X_test[col_name] = X_test[col_name].astype('category')


In [None]:
# Prediction on the test set
test_pred=clf.predict(X_test)

# 4) Write results

In [None]:
# Build the submission frame: the test ids plus a 'target' column holding
# the predictions, row for row
output = test[['id']].assign(target=test_pred)
output.head()

In [None]:
# Write the submission to CSV; index=False keeps the row index out of the file
output.to_csv("submissionv1.csv",index = False)