# Tabular Playground Series - OCT 21

### 1. Import All necessary Libraries

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# List every file available under the Kaggle input directory, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Visual exploratory libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Machine learning related libraries
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


### 2. Read the train, test, submission files to respective dataframes

In [None]:
# Load the competition's training and test sets from the Kaggle input directory.
# Both paths use single separators so the two reads are written consistently.
train_df = pd.read_csv("/kaggle/input/tabular-playground-series-oct-2021/train.csv")
test_df = pd.read_csv("/kaggle/input/tabular-playground-series-oct-2021/test.csv")


In [None]:
# Preview the first rows of the test set to sanity-check the load.
test_df.head()

### 3. Memory reduction function on DataFrames to help avoid memory-exceeded errors

In [None]:
# This cell's code is adapted from "https://www.kaggle.com/hrshuvo/tps-oct-21-xgb-kfold?scriptVersionId=76104876&cellId=7" -- due credit to the original author.

# This function helps to reduce memory usage:
# the data becomes smaller while representing the same values (subject to the narrower dtype's precision).

def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and report the savings.

    Column handling:

    * ``object`` columns are converted to ``category``.
    * Signed-integer, unsigned-integer and float NumPy columns are cast to
      the narrowest dtype of the same kind whose range contains the column's
      minimum and maximum (bounds are inclusive, so e.g. 127 still fits
      ``int8``). A column is never widened; if no candidate fits (for
      example a float column containing ``inf``, or an empty/all-NaN
      column) it is left unchanged.
    * Every other dtype (bool, datetime, category, pandas extension dtypes,
      ...) is left untouched.

    Note: floats are chosen by *range* only. ``float16`` keeps roughly three
    significant decimal digits, so downcast float values may be rounded.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes per NumPy kind code, ordered narrowest first.
    candidates_by_kind = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
        'f': (np.float16, np.float32, np.float64),
    }

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy numeric columns are downcast; anything else
        # (bool, datetime, category, nullable extension dtypes) has no safe
        # min/max-based narrowing here and is skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        type_info = np.iinfo if col_type.kind in 'iu' else np.finfo

        for candidate in candidates_by_kind[col_type.kind]:
            limits = type_info(candidate)
            # NaN or infinite extrema fail both comparisons, so such columns
            # keep their current dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio: a frame reporting zero bytes has nothing to reduce.
    decrease_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease_pct))

    return df

In [None]:
# Shrink both frames (train first, then test) and rebind the names to the results.
train_df, test_df = (reduce_mem_usage(frame) for frame in (train_df, test_df))

### 3. Tasks to explore
* How many Rows and columns -- Rows - 1000000, Col 287
* What are dtypes - 240 Floats, 46 Integer types
* Remove id Column - Soon after reading the dataframe
* What is target column ?? -- Binary (0, 1) -- Binary Classification problem
* How many classes - 2 classes (0 and 1)
* Any Missing Values - No Missing Values
* Any Duplicated Rows ?? - No Duplicated Rows
* Any Duplicate Columns ?? - No Duplicated columns
* Remove the id column from the train_df - done
* What kind of classification can be used here ??. -- XGBoost

In [None]:
# Dimensions of the training set as (rows, columns).
train_df.shape

In [None]:
# Count how many columns share each dtype.
train_df.dtypes.value_counts()

In [None]:
# Show every column name of the training set as a plain list.
print(train_df.columns.tolist())

In [None]:
# Inspect how the training rows are split between the target values
# (the same distribution is plotted in the next cell).
train_df['target'].value_counts()

In [None]:
# Bar chart of the number of training rows per target value.
fig = plt.figure(figsize=(20, 5))
sns.countplot(data=train_df, x='target')
plt.show()

### Comment : Target data is evenly distributed 

In [None]:
# Number of columns that contain at least one missing value (0 means none are missing).
int(train_df.isna().any().sum())

In [None]:
# Count rows flagged by DataFrame.duplicated() as repeats of another row.
train_df.duplicated().sum()

### 4. Data Prep for Machine learning

In [None]:
# Feature matrix: every training column except the row id and the label.
X = train_df.drop(columns=["id", "target"]).to_numpy()
# Label vector.
y = train_df["target"].to_numpy()
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=85
)
# Competition test set as a plain array, without the id column.
test_submit = test_df.drop(columns="id").to_numpy()

### 5. XGBOOST Classification Model Building

In [None]:
import xgboost as xgb

# Settings passed to the classifier: GPU predictor and GPU histogram tree
# method on device 0, with a binary logistic objective.
xgb_params = dict(
    predictor='gpu_predictor',
    tree_method='gpu_hist',
    gpu_id=0,
    objective='binary:logistic',
)

# Build the classifier from the settings above and fit it on the training split.
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

### 6. predicting the X_test Data 

In [None]:
# Predict on the held-out split; the bare name on the last line displays the result.
pred_test = model.predict(X_test)
pred_test 


### 7. Model Accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on the held-out split, printed as a percentage.
accuracy_pct = 100 * accuracy_score(y_test, pred_test)
print(accuracy_pct)

### 8. Predicting the test.csv file data with the model built

In [None]:
# The submission uses probabilities rather than hard labels:
# keep only the second column (index 1) of the predict_proba output.
class_probabilities = model.predict_proba(test_submit)
y_result = class_probabilities[:, 1]
print(y_result)

### 9. Preparing the submission file

In [None]:
# Put the predicted probabilities in a 'target' column, then attach the ids from the test set.
submission_result = pd.DataFrame({'target': y_result})
submission_result['id'] = test_df['id']
submission_result.head()

In [None]:
# Write the submission frame to submission.csv, omitting the DataFrame index.
submission_result.to_csv("submission.csv",index=False)