In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

<h1 id="heading3">
<span style="font-size:22px;">Loading Data</span>
<a class="anchor-link" href="https://www.kaggle.com/alampalsingh/notebook004d896e8c#heading3" target ='_self'></a>
</h1>

In [None]:
# Load the heart-attack dataset CSV into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../input/heart-attack-analysis-prediction-dataset/heart.csv'
)

In [None]:
# Show the loaded DataFrame as this cell's output.
df

<h1 id="heading">
<span style="font-size:22px;">Checking for NaN value.</span>
<a class="anchor-link" href="https://www.kaggle.com/alampalsingh/notebook004d896e8c#heading" target ='_self'></a>
</h1>

In [None]:
# Count missing values column by column.
df.isna().sum(axis=0)

<h1 id="heading2">
<span style="font-size:22px;">Outliers Treatment.</span>
<a class="anchor-link" href="https://www.kaggle.com/alampalsingh/notebook004d896e8c#heading2" target ='_self'></a>
</h1>

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# One joint plot per column except the last one: target (`output`) on the
# x-axis, the column's values on the y-axis.
for feature_name in df.columns[:-1]:
    sns.jointplot(x=df['output'], y=df[feature_name])

<span style="font-size:18px;color:crimson;">***Skewness***</span>

In [None]:
# Draw a kernel-density estimate for every column, one figure per column.
for column_name in df.columns:
    df[column_name].plot.density()
    plt.show()

In [None]:
# Cap upper-tail outliers: in every column, values strictly above that
# column's 99th percentile are replaced by the column's median.
#
# Each column is rewritten with a single `Series.mask` assignment instead of
# chained indexing (`df.iloc[:, i][mask] = value`). A chained write may land on
# a temporary copy and never reach `df` (always the case under pandas
# Copy-on-Write), which is why the warning it triggers should not be silenced.
#
# NOTE(review): the loop covers every column, including `output` and any
# categorical-coded columns -- confirm that capping those is intended.
for column_name in df.columns:
    column = df[column_name]
    # Both statistics are taken from the unmodified column before writing.
    upper_bound = np.percentile(column, 99)
    column_median = column.median()
    df[column_name] = column.mask(column > upper_bound, column_median)


<span style="font-size:18px;color:crimson;">***Correlation***</span>

In [None]:
# Pairwise Pearson correlation between all columns.
df.corr(method='pearson')

In [None]:
# Visualise the correlation matrix as a heatmap.
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix)

In [None]:
# Convert age (the first column) into ordinal groups:
#   29 <= age < 45 -> 0
#   45 <= age < 60 -> 1
#   60 <= age < 78 -> 2
# Values outside [29, 78) are left unchanged, so re-running this cell on
# already-encoded values (0, 1, 2) is a no-op.
#
# Vectorised with np.select instead of a per-row Python loop of scalar
# `iloc` reads and writes; all conditions are evaluated on the original
# values before anything is written back.
age_values = df.iloc[:, 0].to_numpy()
age_group = np.select(
    [
        (age_values >= 29) & (age_values < 45),
        (age_values >= 45) & (age_values < 60),
        (age_values >= 60) & (age_values < 78),
    ],
    [0, 1, 2],
    default=age_values,
)
df.iloc[:, 0] = age_group

In [None]:
# Feature matrix: every column except the target.
x = df.drop(columns='output')

In [None]:
# Target vector.
y = df['output']

In [None]:
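# Feature scaling is left disabled: the models below are tree-based ensembles,
# which do not require standardized inputs.
#ss = StandardScaler()
#x = ss.fit_transform(x)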

In [None]:
# Hold out 20% of the rows for testing; the seed fixes the partition.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

<h1 id="heading4">
<span style="font-size:22px;">Random Forest.</span>
<a class="anchor-link" href="https://www.kaggle.com/alampalsingh/notebook004d896e8c#heading4" target ='_self'></a>
</h1>

In [None]:
# Fit a random forest with a fixed seed and report its test accuracy.
rfc = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=7,
    random_state=10,
).fit(x_train, y_train)
rfc_predictions = rfc.predict(x_test)
accuracy_score(y_test, rfc_predictions)

<h1 id="heading5">
<span style="font-size:18px;color:green;">Most of us see random_state as just a number that makes results reproducible when the same model is rerun, but the choice of random_state can also change your model's measured accuracy. There isn't any trick to picking a random_state; we just have to iterate over values and hope for better results. Note that a seed chosen this way is tuned to this particular test split, so the gain may not carry over to new data.</span>
<a class="anchor-link" href="https://www.kaggle.com/alampalsingh/notebook004d896e8c#heading5" target ='Note'></a>
</h1>

<h1 id="heading6">
<span style="font-size:22px;">Random Forest with variable random_state.</span>
<a class="anchor-link" href="https://www.kaggle.com/alampalsingh/notebook004d896e8c#heading6" target ='_self'></a>
</h1>

In [None]:
# Refit the same random forest with model seeds 0..99 on the fixed split and
# collect the test accuracy of each fit in `aa`.
aa = []
for model_seed in range(100):
    rfc = RandomForestClassifier(
        n_estimators=100,
        min_samples_leaf=7,
        random_state=model_seed,
    ).fit(x_train, y_train)
    seed_accuracy = accuracy_score(y_test, rfc.predict(x_test))
    aa.append(seed_accuracy)

<span style="font-size:18px;color:crimson;">***Here you can see how much the result varies just by changing the random_state.***</span>

In [None]:
# Worst, average and best accuracy across the seed sweep.
(min(aa), np.mean(aa), max(aa))

In [None]:
# Accuracy as a function of the random forest's seed (0..99).
seed_axis = np.arange(100)
plt.figure(figsize=(15, 5))
plt.plot(seed_axis, aa)
plt.show()

<h1 id="heading7">
<span style="font-size:22px;">XGBoost</span>
<a class="anchor-link" href="https://www.kaggle.com/alampalsingh/notebook004d896e8c#heading7" target ='_self'></a>
</h1>

In [None]:
# A single XGBoost fit on the fixed split. The random_state sweep is not
# repeated for XGBoost because a single fit takes more time compared to LightGBM.
xgb_params = dict(
    max_depth=4,
    alpha=10,
    learning_rate=0.2,
    n_estimators=100,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(x_train, y_train)
xgb_predictions = xgb.predict(x_test)
accuracy_score(y_test, xgb_predictions)

<h1 id="heading8">
<span style="font-size:22px;">LightGBM with variable random_state.</span>
<a class="anchor-link" href="https://www.kaggle.com/alampalsingh/notebook004d896e8c#heading8" target ='_self'></a>
</h1>

In [None]:
# Evaluate LightGBM on 100 different train/test partitions (split seeds 0..99)
# and record the test accuracy of each run in `h`.
#
# What varies here is the seed of train_test_split, i.e. which rows end up in
# the test set -- not a seed of the model itself.
#
# The per-iteration split is bound to loop-local names so the notebook-wide
# x_train/x_test/y_train/y_test (the random_state=42 split) are not
# overwritten; otherwise re-running an earlier model cell after this one
# would silently use the last split of this loop.
#
# NOTE(review): taking max(h) afterwards reports the most favourable split,
# which is an optimistic estimate; the mean over splits is the fairer summary.
h = []
for split_seed in range(100):
    x_tr, x_te, y_tr, y_te = train_test_split(
        x, y, test_size=0.2, random_state=split_seed
    )
    lgb = LGBMClassifier(max_depth=4, reg_alpha=10, learning_rate=0.01)
    lgb.fit(x_tr, y_tr)
    acc = accuracy_score(y_te, lgb.predict(x_te))
    h.append(acc)

In [None]:
# Highest accuracy among the values collected in `h`.
max(h)

In [None]:
# Plot the accuracies in `h` against their index and point an arrow at the
# first occurrence of the maximum.
best_accuracy = max(h)
best_position = h.index(best_accuracy)
arrow_style = dict(arrowstyle='->', color='red', linewidth=3, mutation_scale=50)

plt.figure(figsize=(15, 6))
plt.plot(np.arange(100), h)
plt.annotate(
    "Max",
    (best_position, best_accuracy),
    xytext=(15, 0.93),
    arrowprops=arrow_style,
)
plt.show()

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#2BAE66FF;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">

<p style="padding: 10px;color:#FCF6F5FF;">This is the maximum accuracy that I have obtained. If you find this notebook helpful, don't forget to upvote.</p>