In [None]:
# For data reading
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns


# For Feature Scaling & Feature Importance
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import ExtraTreesRegressor

# For model building & scoring
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


# others
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

#### Read Dataset

In [None]:
# Load the train set, the test set and the train data dictionary (in that order)
train_df, test_df, train_dict = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/av-healthcare-analytics-ii/healthcare/train_data.csv',
        '../input/av-healthcare-analytics-ii/healthcare/test_data.csv',
        '../input/av-healthcare-analytics-ii/healthcare/train_data_dictionary.csv',
    )
]

#### Let's encode the dependent feature ("Stay" column)

In [None]:
# List the distinct values of the dependent feature ("Stay") so that every
# one of them can be given a numeric code in the next cell.
train_df["Stay"].unique()

In [None]:
# Number the "Stay" labels from 1 upwards, in the order listed here,
# and replace each label in the train set with its code.
stay_labels = [
    '0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80',
    '81-90', '91-100', 'More than 100 Days',
]
encode = {label: code for code, label in enumerate(stay_labels, start=1)}
train_df['Stay'] = train_df['Stay'].map(encode)

### Handle missing values in the features

In [None]:
# Count the missing values per column in both datasets and print the two
# reports one after the other.
train_missing_counts = train_df.isnull().sum()
test_missing_counts = test_df.isnull().sum()
print(
    'Train Dataset:::::::::::::::',
    train_missing_counts,
    "=========================================",
    'Test Dataset::::::::::::::::',
    test_missing_counts,
    sep='\n',
)

In [None]:
# Find the features that contain missing values
def NaNFeature(df):
    """Return the names of the columns of ``df`` that hold at least one missing value.

    The names are returned as a list, in the same order as ``df.columns``.
    """
    has_missing = df.isnull().any()
    return [column for column, flag in has_missing.items() if flag]

In [None]:
# Fill the missing values of the train dataset with each column's most
# frequent value (its mode).
nan_features_train = NaNFeature(train_df)
for fillnan in nan_features_train:
    # Assign the filled column back to the frame: calling fillna(inplace=True)
    # on `train_df[fillnan]` is chained assignment, which leaves the frame
    # unchanged once pandas Copy-on-Write is in effect.
    train_df[fillnan] = train_df[fillnan].fillna(train_df[fillnan].mode()[0])

In [None]:
# Fill the missing values of the test dataset with each column's most
# frequent value (its mode).
nan_features_test = NaNFeature(test_df)
for fillnan in nan_features_test:
    # Assign the filled column back to the frame: calling fillna(inplace=True)
    # on `test_df[fillnan]` is chained assignment, which leaves the frame
    # unchanged once pandas Copy-on-Write is in effect.
    test_df[fillnan] = test_df[fillnan].fillna(test_df[fillnan].mode()[0])

In [None]:
# Re-check the missing values, this time as a percentage of the rows
def _missing_percentage(frame):
    """Percentage of missing values per column of ``frame``, rounded to 4 decimals."""
    return np.round(frame.isnull().sum() * 100 / len(frame), 4)

print(
    'Train Dataset:::::::::::::::',
    _missing_percentage(train_df),
    "=========================================",
    'Test Dataset:::::::::::::::',
    _missing_percentage(test_df),
    sep='\n',
)

### Handle Categorical Features

In [None]:
# Find the categorical features of a dataset
def CatFeatures(df):
    """Return the names of the columns of ``df`` whose dtype is object ("O").

    The names are returned as a list, in the same order as ``df.columns``.
    """
    object_columns = []
    for column, dtype in df.dtypes.items():
        if dtype == "O":
            object_columns.append(column)
    return object_columns

In [None]:
# Categorical (object-dtype) features of the train dataset.
# "Stay" was already mapped to numeric codes above, so it is not expected here.
cat_features_train = CatFeatures(train_df)
cat_features_train

In [None]:
# Print the distinct values of every categorical feature of the train data
for column_name in cat_features_train:
    distinct_values = train_df[column_name].unique()
    print(distinct_values)

In [None]:
# Categorical (object-dtype) features of the test dataset
cat_features_test = CatFeatures(test_df)
cat_features_test

In [None]:
# Print the distinct values of every categorical feature of the test data
for column_name in cat_features_test:
    distinct_values = test_df[column_name].unique()
    print(distinct_values)

In [None]:
# Encode the categorical features of both datasets as integers
def CatToNumaric():
    """Replace the categorical values of ``train_df`` and ``test_df`` with integer codes.

    Works on the module-level frames ``train_df`` / ``test_df`` and the column
    lists ``cat_features_train`` / ``cat_features_test``; takes no arguments
    and returns nothing.

    Each train column is numbered by order of first appearance of its values.
    The same mapping is then applied to the matching test column, so a given
    category gets the same code in both datasets. Numbering each dataset
    independently would give a category different codes whenever its first
    appearance differs between the two files.
    """
    shared_codes = {}

    # Handle categorical features of the train dataset
    for n in cat_features_train:
        num_data = {value: code for code, value in enumerate(train_df[n].unique())}
        if n in cat_features_test:
            # Values that occur only in the test data get fresh codes after
            # the train codes instead of becoming NaN when mapped.
            for value in test_df[n].unique():
                num_data.setdefault(value, len(num_data))
        shared_codes[n] = num_data
        train_df[n] = train_df[n].map(num_data)

    # Handle categorical features of the test dataset
    for n in cat_features_test:
        num_data = shared_codes.get(n)
        if num_data is None:
            # Column is categorical only in the test data: number it on its own.
            num_data = {value: code for code, value in enumerate(test_df[n].unique())}
        test_df[n] = test_df[n].map(num_data)

In [None]:
# Encode the categorical features, then print the resulting column dtypes of
# both datasets to confirm the conversion.
CatToNumaric()
print(
    'Train Dataset:::::::::::::::',
    train_df.dtypes,
    "=====================================",
    'Test Dataset:::::::::::::::',
    test_df.dtypes,
    sep='\n',
)

#### Clean unused features

In [None]:
# Show the train data dictionary to help decide which features are
# unnecessary and can be dropped.
train_dict

In [None]:
# Drop the features that are not needed for the model
def DropFeatures(df):
    """Remove the unused columns from ``df`` in place and return that same frame.

    The columns removed are 'case_id', 'Hospital_code', 'Hospital_type_code'
    and 'patientid'. Because the drop happens in place, the frame passed in
    by the caller is modified as well.
    """
    unused_columns = ['case_id', 'Hospital_code', 'Hospital_type_code', 'patientid']
    df.drop(columns=unused_columns, inplace=True)
    return df

In [None]:
# Drop the unused columns from the train dataset and preview the result.
# DropFeatures works in place and returns its argument, so `train_data` and
# `train_df` are the same frame from here on.
train_data = DropFeatures(train_df)
train_data.head()

In [None]:
# Drop the unused columns from the test dataset and preview the result.
# DropFeatures works in place and returns its argument, so `test_data` and
# `test_df` are the same frame from here on.
test_data = DropFeatures(test_df)
test_data.head()

### Feature Scaling (MinMax Scaler)

In [None]:
# Split the train data by position: every column except the last one is a
# predictor (X_train), the last column is the dependent feature (y_train).
X_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]

# The test data has no dependent feature, so it is used as-is
X_test = test_data

In [None]:
# Scale a feature frame with a MinMaxScaler
def FeatureScaler(df):
    """Fit a new MinMaxScaler on ``df`` and return the scaled values as a DataFrame.

    The result keeps the column names of ``df``; its index is a fresh default
    index because only the scaled array and the column names are passed on.
    """
    scaled_values = MinMaxScaler().fit_transform(df)
    return pd.DataFrame(scaled_values, columns=df.columns)

In [None]:
# Scale the train features (the scaler is fitted on X_train itself) and
# preview the final train dataset.
X_train_final = FeatureScaler(X_train)
X_train_final.head()

In [None]:
# Scale the test features with the min/max learned from the TRAIN features.
# Fitting a separate scaler on the test set would map the same raw value to
# different scaled values in train and test, so the model would see test
# inputs on a different scale from the one it was trained on.
train_fitted_scaler = MinMaxScaler().fit(X_train)
X_test_final = pd.DataFrame(
    train_fitted_scaler.transform(X_test), columns=X_test.columns
)
# Let's show final test dataset
X_test_final.head()

### Find Feature Importance
Find the 10 most important features in the dataset.

In [None]:
# Fit an Extra Trees regressor to estimate the importance of each feature.
# The estimator is randomized, so a fixed random_state is passed to get the
# same importances on every run of the notebook.
feature_imp = ExtraTreesRegressor(random_state=42)
feature_imp.fit(X_train_final, y_train)
# Let's show the list of feature importance
feature_imp.feature_importances_

In [None]:
# Plot the ten largest feature importances as a horizontal bar chart
feature_importance = pd.Series(feature_imp.feature_importances_, index=X_train_final.columns)
top_ten_features = feature_importance.nlargest(10)
top_ten_features.plot.barh()
plt.show()

### Create Model with Random Forest Classifier

In [None]:
# Create the model. The forest is randomized, so a fixed random_state is
# passed to make the fitted model, its predictions and its scores
# reproducible across runs of the notebook.
stay_predict = RandomForestClassifier(random_state=42)
stay_predict.fit(X_train_final, y_train)

#### Model Testing

In [None]:
# Predict the encoded "Stay" class for every row of the scaled test features.
# Despite its name, `y_test` holds the model's predictions, not true labels.
y_test = stay_predict.predict(X_test_final)
y_test

In [None]:
# The submission file needs 'case_id', which was dropped from the test
# features, so read it back from the sample submission file.
# Only `sample_sub_df` is bound here: also rebinding `test_df` would replace
# the test dataset with the sample submission.
sample_sub_df = pd.read_csv('../input/av-healthcare-analytics-ii/healthcare/sample_sub.csv')

#### Decode Prediction data

In [None]:
# Map each numeric code back to its original "Stay" label
decode_prediction = {
    1: '0-10',
    2: '11-20',
    3: '21-30',
    4: '31-40',
    5: '41-50',
    6: '51-60',
    7: '61-70',
    8: '71-80',
    9: '81-90',
    10: '91-100',
    11: 'More than 100 Days',
}

# Pair every case_id of the sample submission with its predicted code,
# then decode the codes into labels.
predection_df = pd.DataFrame({'case_id': sample_sub_df['case_id'], 'Stay': y_test})
predection_df['Stay'] = predection_df['Stay'].map(decode_prediction)
predection_df.head()

### Scoring & Validation

In [None]:
# Model score on the training data.
# NOTE(review): this is the same data the model was fitted on, so the value
# is an optimistic estimate; the cross-validation below is the better guide
# to performance on unseen data.
stay_predict.score(X_train_final, y_train)

In [None]:
# 10-fold cross-validation with the estimator's default scorer.
# The target is passed as a 1-D NumPy array via to_numpy(); Series.ravel()
# is deprecated in pandas and is not needed because a Series is already 1-D.
score = cross_val_score(stay_predict, X_train_final, y_train.to_numpy(), cv=10)
score.mean()

#### Create a submission File

In [None]:
submission = predection_df.copy()
# Write the submission file to the working directory; index=False keeps the
# row index out of the CSV so it contains only 'case_id' and 'Stay'.
submission.to_csv('submission.csv', index=False)
submission.head()

#### Thanks
Comments and feedback from everyone are welcome.

Source code on GitHub: https://github.com/sheikhmasudrana/ML_Practice/tree/master/Healthcare%20Analytics(stay%20days%20prediction)