In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)


def parse_date(val):
    """Convert `hour` value(s) to pandas datetimes.

    Accepts a scalar or a Series. The compact two-digit-year layout
    '%y%m%d%H' (the format this notebook originally intended to use) is tried
    first; if the values do not match it (e.g. the CSV already stores
    ISO-formatted timestamps) pandas' generic parser is used instead.
    """
    try:
        return pd.to_datetime(val, format='%y%m%d%H')
    except (ValueError, TypeError):
        # Not in the compact layout: deliberately fall back to format inference.
        return pd.to_datetime(val)


# Read `hour` as text so the values reach parse_date unmodified; previously
# parse_date was defined but never handed to read_csv, leaving the column to
# be parsed by inference only.
train = pd.read_csv("../input/100k-records/100k.csv", dtype={'hour': str})
train['hour'] = parse_date(train['hour'])
train.head()

In [None]:
# Summary statistics of the freshly loaded training frame.
train.describe()

In [None]:
# Column dtypes and non-null counts before any casting is applied.
train.info()

In [None]:
# Every column name except 'hour' (the list is used to cast those columns next).
col = train.columns.to_list()
del col[col.index('hour')]
col

In [None]:
# Treat every non-'hour' column as object dtype, then re-check the dtypes.
as_object = train[col].astype(object)
train[col] = as_object
train.info()

In [None]:
# Summary restricted to object-dtype columns (the columns cast to object earlier in the notebook).
train.describe(include = 'object')

In [None]:
# Object-dtype columns, leaving out the row identifier and the target.
cat_features = [
    name
    for name, dtype in train.dtypes.items()
    if dtype == 'object' and name not in ('id', 'click')
]
cat_features

In [None]:
# One row per categorical feature with its distinct-value count, ascending.
unique = (
    train[cat_features]
    .nunique()
    .sort_values(ascending=True)
    .to_frame(name='Number of Unique Values')
    .rename_axis('Feature')
    .reset_index()
)
unique = unique[unique['Feature'] != 'click']
unique

In [None]:
# Features with at least 10 distinct values.
is_high_cardinality = unique['Number of Unique Values'] >= 10
high_cardinality_features = unique.loc[is_high_cardinality, 'Feature'].unique().tolist()
high_cardinality_features

In [None]:
# Features with fewer than 10 distinct values.
is_low_cardinality = unique['Number of Unique Values'] < 10
low_cardinality_features = unique.loc[is_low_cardinality, 'Feature'].unique().tolist()
low_cardinality_features

In [None]:
# Summary of the 'hour' column in the training data.
train['hour'].describe()

In [None]:
# Share of each target value, as a percentage rounded to two decimals.
(train['click'].value_counts() * 100 / len(train)).round(2)

In [None]:
# Replace the current index with a fresh 0..n-1 range; the old index is discarded.
train.reset_index(drop=True, inplace=True)
train.head()

In [None]:
# Percentage breakdown of every low-cardinality feature, one feature at a time.
for feature in low_cardinality_features:
    print(feature)
    level_share = round(train[feature].value_counts() * 100 / len(train), 2)
    print(level_share, '\n')

In [None]:
!pip install pycaret[full]

In [None]:
# Explicit imports instead of `import *`, so it is clear which names come from
# PyCaret; these are all the PyCaret functions the notebook calls.
from pycaret.classification import (
    setup,
    create_model,
    tune_model,
    evaluate_model,
    plot_model,
    predict_model,
    finalize_model,
    save_model,
)
# The dtype mapping built further down uses `np`, and no other cell imports
# numpy explicitly, so it is imported here rather than left to leak in via a
# wildcard import.
import numpy as np

In [None]:
# Configure the PyCaret classification experiment on `train`.
ad = setup(
    # --- data, target and split ---
    data=train,
    target="click",
    session_id=42,
    train_size=0.8,  # 80% of the rows go to training
    # --- column roles ---
    ignore_features=['id'],
    date_features=['hour'],
    categorical_features=cat_features,
    high_cardinality_features=high_cardinality_features,
    # --- categorical handling ---
    handle_unknown_categorical=True,
    unknown_categorical_method='most_frequent',
    combine_rare_levels=True,
    ignore_low_variance=True,  # drop categorical features with statistically insignificant variance
    # --- numeric preprocessing ---
    remove_multicollinearity=True,  # of two highly correlated features, drop one
    normalize=True,
    transformation=True,
)

In [None]:
# Fit the estimator PyCaret registers under the id 'ada' (AdaBoost classifier in
# PyCaret's model library) on the experiment configured by setup().
model_ada = create_model('ada')
model_ada

In [None]:
# Confusion matrix of the baseline model, requested on the training data (use_train_data=True).
plot_model(model_ada,plot = 'confusion_matrix',use_train_data=True)

In [None]:
# Confusion matrix of the baseline model with plot_model's default data
# (the hold-out split, per PyCaret's documentation).
plot_model(model_ada,plot = 'confusion_matrix')

In [None]:
# No `data` argument is given, so predict_model scores its default dataset
# (the hold-out split, per PyCaret's documentation).
predict_model(model_ada)

In [None]:
# Hyperparameter tuning of the baseline model, with Accuracy as the metric to optimise.
model_ada_Accuracytuned = tune_model(model_ada,optimize = 'Accuracy')
model_ada_Accuracytuned

In [None]:
# Open PyCaret's evaluation view for the tuned model.
evaluate_model(model_ada_Accuracytuned)

In [None]:
# Confusion matrix of the tuned model, requested on the training data (use_train_data=True).
plot_model(model_ada_Accuracytuned,plot = 'confusion_matrix',use_train_data=True)

In [None]:
# Confusion matrix of the tuned model with plot_model's default data
# (the hold-out split, per PyCaret's documentation).
plot_model(model_ada_Accuracytuned,plot = 'confusion_matrix')

In [None]:
# Score the tuned model on predict_model's default dataset (no `data` argument given).
predict_model(model_ada_Accuracytuned)

In [None]:
# Finalize the tuned model for deployment.
# NOTE(review): per PyCaret's documentation finalize_model refits on the whole
# dataset, hold-out included — so hold-out metrics computed on `final_ada`
# afterwards are not out-of-sample. Confirm for the installed version.
final_ada = finalize_model(model_ada_Accuracytuned)
final_ada

In [None]:
# Open PyCaret's evaluation view for the finalized model.
# NOTE(review): `final_ada` comes from finalize_model; if that refit included the
# hold-out rows, the figures shown here are optimistic — confirm.
evaluate_model(final_ada)

In [None]:
# Confusion matrix of the finalized model, requested on the training data (use_train_data=True).
plot_model(final_ada,plot = 'confusion_matrix',use_train_data=True)

In [None]:
# Confusion matrix of the finalized model with plot_model's default data.
# NOTE(review): `final_ada` comes from finalize_model; if that refit included the
# hold-out rows, this matrix is not an out-of-sample estimate — confirm.
plot_model(final_ada,plot = 'confusion_matrix')

In [None]:
# Score the finalized model on predict_model's default dataset (no `data` argument given).
# NOTE(review): `final_ada` comes from finalize_model; if that refit included the
# hold-out rows, these scores are not out-of-sample — confirm.
predict_model(final_ada)

In [None]:
# numpy is imported in this cell because the notebook never imports it
# explicitly elsewhere; without it `np` below is only defined if another
# import happens to leak it into the namespace.
import numpy as np

# Columns read as integers.
int_columns = ['id', 'click', 'hour']
# Columns read as strings.
str_columns = [
    'C1', 'banner_pos',
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]

# Column -> dtype mapping passed to read_csv for the unseen data; key order
# matches the original hand-written mapping (integer columns first).
types_train = {
    **dict.fromkeys(int_columns, np.dtype(int)),
    **dict.fromkeys(str_columns, np.dtype(str)),
}

In [None]:
# Read `hour` as text (overriding the integer entry in types_train) and parse
# it explicitly instead of relying on read_csv's date inference.
unseen_data = pd.read_csv(
    '../input/avazu-ctr-prediction-with-random-50k-rows/50krecords.csv',
    dtype={**types_train, 'hour': str},
)
try:
    # Compact two-digit-year layout, the format the notebook's parse_date
    # helper targets. NOTE(review): assumes this file uses the same `hour`
    # encoding as the training file — confirm.
    unseen_data['hour'] = pd.to_datetime(unseen_data['hour'], format='%y%m%d%H')
except (ValueError, TypeError):
    # Values are not in the compact layout (e.g. already ISO timestamps):
    # deliberately fall back to pandas' generic parser.
    unseen_data['hour'] = pd.to_datetime(unseen_data['hour'])
unseen_data.head()

In [None]:
# Summary of the 'hour' column in the unseen data.
unseen_data['hour'].describe()

In [None]:
# Feature-only copy of the unseen data: the target column is removed before scoring.
data = unseen_data.drop(columns=['click'])
data.head()

In [None]:
# Score the unseen rows (target column already dropped) with the finalized model.
# NOTE(review): later cells read the predicted class from a `Label` column, which
# is the PyCaret 2.x output name — confirm it matches the installed version.
predictions = predict_model(final_ada, data = data)
predictions.head()

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predicted labels against the true clicks.
report = classification_report(unseen_data['click'], predictions['Label'])
print(report)

In [None]:
from sklearn.metrics import confusion_matrix

# Raw confusion-matrix counts on the unseen data (true clicks vs. predicted labels).
unseen_cm = confusion_matrix(unseen_data['click'], predictions['Label'])
print(unseen_cm)

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Same confusion matrix as above, drawn as an annotated heatmap.
array = confusion_matrix(unseen_data['click'], predictions['Label'])
df_cm = pd.DataFrame(array, index=range(2), columns=range(2))
sn.heatmap(
    df_cm,
    annot=True,
    fmt='g',
    annot_kws={"size": 16},  # annotation font size
)
plt.show()

In [None]:
# Persist the finalized model under the name './CTRPredictionmodel' in the working directory
# (PyCaret writes it as a pickle file, per its documentation).
save_model(final_ada, './CTRPredictionmodel')