# MLJAR Automated Machine Learning

mljar-supervised is an Automated Machine Learning Python package that works with tabular data. It is designed to save time for data scientists. It abstracts the common way to preprocess the data, construct the machine learning models, and perform hyperparameter tuning to find the best model. It is not a black box: you can see exactly how the ML pipeline is constructed (with a detailed Markdown report for each ML model).

https://github.com/mljar/mljar-supervised#automated-machine-learning-rocket

![](https://pbs.twimg.com/media/EeQGDN4XoAArvSC.png) Image source: twitter.com

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show every column when a frame is displayed, then load the summary CSV.
pd.set_option('display.max_columns', None)
df = pd.read_csv("/kaggle/input/covid19-in-angola/summary.csv")
# Last expression of the cell: shows the final rows of the frame.
df.tail()

# Code by Abid Ali Awan: https://www.kaggle.com/kingabzpro/automljar-golden-features/notebook

In [None]:
# Install (quietly, upgrading if present) mljar-supervised from the master branch of its GitHub repository.
!pip install -q -U git+https://github.com/mljar/mljar-supervised.git@master
# Pin matplotlib to 3.1.3.
# NOTE(review): the reason for this pin is not stated here — presumably a compatibility workaround; confirm it is still needed.
!pip install -q -U matplotlib==3.1.3 

In [None]:
from supervised import AutoML

In [None]:
import seaborn as sns
sns.set_style("whitegrid", {'axes.grid' : False})
import os
from tqdm.notebook import tqdm
import gc
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns; sns.set()
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score,mean_squared_error,auc
from sklearn import metrics

In [None]:
# Name of the column the model is trained to predict.
TARGET_COL = "Total de Recuperados"

# Both frames are read from the same summary.csv; they differ only in the
# 'label' marker added below.
df = pd.read_csv("../input/covid19-in-angola/summary.csv")
print(df.shape)
test = pd.read_csv("../input/covid19-in-angola/summary.csv")
print(test.shape)
df['label'] = 'train'
test['label'] = 'test'

# Stack the two frames; the keys become the outer level of the row index.
frames = [df, test]
join_df = pd.concat(frames, keys=['x', 'y'])
assert len(join_df) == len(df) + len(test)

# Fraction of missing values per column, as a two-column frame
# ('a' = column name, 'b' = missing fraction).
lst = join_df.isna().sum() / len(join_df)
p = lst.to_frame().reset_index()
p.columns = ['a', 'b']

# Remove every column whose missing fraction exceeds 0.8.
low_count = p.loc[p['b'] > 0.8]
todelete = low_count['a'].values
join_df.drop(columns=todelete, inplace=True)
join_df.head()

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their range.

    The frame is modified in place and is also returned.

    Integer columns are narrowed to the first of int8/int16/int32 whose range
    contains the column's min and max (bounds inclusive, so a value equal to a
    dtype's limit still fits that dtype). Float columns are narrowed to
    float16 or float32 using the same range test. A column whose range fits
    no candidate -- including empty or all-NaN columns, whose min/max fail
    every comparison -- keeps its current dtype, so a column is never widened.

    NOTE: the float test checks range only, not precision. float16 keeps
    roughly three significant decimal digits, so downcast values may be
    rounded.

    Args:
        df: pandas DataFrame to shrink.
        verbose: when true, print the final memory usage and the percentage
            saved.

    Returns:
        The same DataFrame object, with its numeric columns downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, smallest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo
        for candidate in candidates:
            limits = limits_of(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Remove the 'Novos Recuperados' column before imputing.
join_df.drop(['Novos Recuperados'], inplace=True, axis=1)
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

newdf = join_df.select_dtypes(include=numerics)
numeric_cols = newdf.columns

# Need to do column by column due to memory constraints
categorical_cols = ['Data', 'label']
# Categorical columns: fill gaps with the most frequent value of the column.
for v in tqdm(categorical_cols):
    join_df[v] = join_df[v].fillna(join_df[v].value_counts().index[0])
# Numeric columns: fill gaps with the mean of the row's (Data, label) group.
# Iterate over the column names themselves (not a one-element list wrapping
# the whole Index) so the work really is done one column at a time.
# transform() returns a result aligned with join_df's own index, so the
# assignment cannot be misaligned by group keys the way apply() output can be.
for v in tqdm(numeric_cols):
    join_df[v] = join_df.groupby(['Data', 'label'], sort=False)[v].transform(
        lambda x: x.fillna(x.mean())
    )
# Remaining missing values in the categorical columns (displayed by the cell).
join_df[categorical_cols].isna().sum()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Record which rows are train/test BEFORE encoding: the encoder replaces the
# 'label' strings with integer codes, after which comparing the column with
# "train"/"test" matches no rows and both splits come out empty.
is_train = join_df['label'] == "train"
is_test = join_df['label'] == 'test'

# In loop to minimize memory use
for v in tqdm(categorical_cols):
    # ravel() flattens the encoder's (n, 1) output to one value per row.
    join_df[v] = OrdinalEncoder(dtype="int").fit_transform(join_df[[v]]).ravel()

gc.collect()

# .copy() makes each split an independent frame, so the in-place edits below
# do not operate on a slice of join_df.
train = join_df[is_train].copy()
predict = join_df[is_test].copy()

# reset_index() moves the index levels into 'level_0' / 'level_1' columns,
# which are then dropped together with the train/test marker.
train.reset_index(inplace=True)
train.drop(['level_0', 'level_1', 'label'], inplace=True, axis=1)

# The prediction frame additionally loses the target column.
predict.reset_index(inplace=True)
predict.drop(['level_0', 'level_1', 'Total de Recuperados', 'label'], inplace=True, axis=1)
features = train.columns
num_feature = [col for col in features if col not in categorical_cols]

In [None]:
# Numeric, non-categorical feature columns. The target is left out so the
# pruning below can never mark it for removal: it is still read from `train`
# afterwards and has already been dropped from `predict`.
num_feature = [
    col for col in features
    if col not in categorical_cols
    and col != TARGET_COL
    and train[col].dtype != 'object'
]
drop_columns = []
corr = train[num_feature].corr()
# Drop highly correlated features
# columns[k] stays True while feature k is kept.
columns = np.full((corr.shape[0],), True, dtype=bool)

# Visit each pair (i, j) with i < j once; the later feature j of a strongly
# correlated pair is the one flagged.
for i in range(corr.shape[0]):
    for j in range(i+1, corr.shape[0]):
        if corr.iloc[i,j] >=0.999 :
            if columns[j]:
                columns[j] = False
                print('FEAT_A: {} FEAT_B: {} - Correlation: {}'.format(train[num_feature].columns[i] , train[num_feature].columns[j], corr.iloc[i,j]))
        elif corr.iloc[i,j] <= -0.995:
            # Strong negative correlation: flagged too, but not printed.
            if columns[j]:
                columns[j] = False

# Below, drop_columns is not supposed to be empty.

In [None]:
# Names of the features flagged for removal (mask entries that are False).
drop_columns = train[num_feature].columns[~columns].values
print('drop_columns', len(drop_columns), drop_columns)

In [None]:
# Remove the flagged columns from both frames in place.
for frame in (train, predict):
    frame.drop(columns=drop_columns, inplace=True)
# Share of rows taken by each target value (displayed by the cell).
train[TARGET_COL].value_counts() / len(train)

In [None]:
# Rename the target column of `df` to the shorter 'totrecup'.
# NOTE(review): only `df` is renamed here; code that works on `train` still
# sees the original 'Total de Recuperados' name — confirm this is intended.
df = df.rename(columns={'Total de Recuperados':'totrecup'})

# From here to the end, every cell raises an error.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Rebalance `train`: rows whose target equals 1 are sampled with replacement
# until they match the number of rows whose target equals 0.
# NOTE(review): this treats the target as a 0/1 class label and keeps only
# rows with those two values — confirm that is intended for this dataset.
df_majority = train[train[TARGET_COL] == 0]
df_minority = train[train[TARGET_COL] == 1]

if df_majority.empty or df_minority.empty:
    # resample() cannot draw from an empty frame, and with no majority rows
    # there is no size to match, so `train` is left untouched.
    print('Skipping upsampling: majority rows = {}, minority rows = {}'.format(
        len(df_majority), len(df_minority)))
else:
    df_minority_upsampled = resample(df_minority,
                                     replace=True,                 # sample with replacement
                                     n_samples=len(df_majority),   # to match majority class
                                     random_state=303)             # reproducible results

    # Combine majority class with upsampled minority class
    df_upsampled = pd.concat([df_majority, df_minority_upsampled])

    # Display new class counts (looked up by TARGET_COL: the frame derived
    # from `train` has no 'totrecup' column).
    print(df_upsampled[TARGET_COL].value_counts())
    train = df_upsampled

In [None]:
# Hold out 20% of `train` as a test split; the features are every column
# except the target.
feature_frame = train[[name for name in train if name != TARGET_COL]]
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    train[TARGET_COL],
    test_size=0.20,
    random_state=42,
)
print(X_train.shape, X_test.shape)

In [None]:
# Split the training portion again, keeping 20% of it for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

#Fast AutoMlJar

In [None]:
# Fit AutoML on the whole `train` frame: every column except the target is a
# feature. Runs in "Compete" mode with a total time limit of 4*3600.
X = train[[name for name in train if name != TARGET_COL]]
y = train[TARGET_COL]
automl = AutoML(mode="Compete", total_time_limit=4*3600)
automl.fit(X, y)

# My `predict` frame is empty.

In [None]:
# Display the frame that will be passed to the model for prediction.
predict

In [None]:
# Run the fitted AutoML object on the prediction frame and keep the result in `Final`.
Final=automl.predict_all(predict)

In [None]:
# Store the predictions in the target column of `test`.
# NOTE(review): assumes `Final` has a 'prediction_1' column — confirm against
# what predict_all() returns for this task type.
test[TARGET_COL] = Final['prediction_1']
# Write the two submission columns to submission.csv without the row index.
test[["Novos Casos","Total de Recuperados"]].to_csv("submission.csv",index=False)

In [None]:
#predictions = automl.predict_all(X_valid)

In [None]:
#print("Test MSE:", mean_squared_error(y_valid, predictions['prediction_1'],squared=False))

In [None]:
#automl.report()