-
Notifications
You must be signed in to change notification settings - Fork 0
/
predict_from_model.py
149 lines (108 loc) · 6.64 KB
/
predict_from_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
from datetime import datetime
import numpy
import pandas
from Data_ingestion.data_loader import data_loader
from Data_preprocessing_prediction.preprocessing import preprocessing
from Model_functions.model_functions_fileops import model_functions
from Application_logging.logger import app_logger
from File_operation_prediction.file_operation import file_operation
import pandas as pd
class predict_from_model:
def __init__(self):
# open log writer and file object
self.file_obj = open("Prediction_Logs/Prediction_main_log.txt", 'a+')
self.log_writer = app_logger()
# send log writer and file object for logging to other classes
self.data_loader_obj = data_loader(self.log_writer, self.file_obj)
self.preprocessor = preprocessing(self.log_writer, self.file_obj)
self.model_functions_obj = model_functions(self.log_writer, self.file_obj)
self.file_op_obj = file_operation()
self.prediction_input_file = 'Prediction_FileFromDB/InputFile.csv'
self.prediction_output_file = 'Prediction_Output_File/Predictions.csv'
self.categorical_feature_list_loc='preprocessing_data/categorical_features.csv'
self.columns_to_drop_list_loc = 'preprocessing_data/columns_to_remove.csv'
def get_prediction_from_model(self):
try:
# step 1 delete prediction file from last run
self.file_op_obj.delete_existing_file_create_new(self.prediction_output_file)
self.log_writer.log(self.file_obj, "Start of Prediction!!")
# step 2 get the data from prediction path
data = self.data_loader_obj.get_data(self.prediction_input_file)
# step 2 set categorical features as true if there are categorical features in data (optional)
are_categorical_features = False
categorical_features = self.file_op_obj.read_list(self.categorical_feature_list_loc)
if len(categorical_features)> 0:
are_categorical_features = True
if (are_categorical_features):
data = self.preprocessor.ensure_categorical_data_type(data, categorical_features)
# step2 preprocessing
# adding a derived variable
now = datetime.now()
year = now.date().year
data['vehicle_age'] = year - data['auto_year']
#'policy_no_index' will be used as index
data['policy_no_index'] = data['policy_number']
data.set_index(['policy_no_index'], inplace=True)
# step2.1 remove columns (no columns to remove)
cols_to_remove = self.file_op_obj.read_list(self.columns_to_drop_list_loc)
data = self.preprocessor.remove_columns(data, cols_to_remove)
# replacing '?' in data with NAN values
data.replace('?', numpy.NAN, inplace=True)
# step2.2 handle /impute null values if present
is_null_present, columns_with_null = self.preprocessor.is_null_present(data)
if is_null_present:
# check if null is in categorical variables then call categorical imputer .
# print(data[columns_with_null].dtypes.value_counts()['category'])
# if (data[columns_with_null].dtypes.value_counts()['category'] > 0):
data = self.preprocessor.impute_Categorical_values(data, columns_with_null)
# nulls are in non categorical columns then
# if categorical columns exist in data then encode them
if are_categorical_features:
data = self.preprocessor.encode_categorical_columns_from_mapping_file(data, categorical_features)
# then call KNN imputer
data = self.preprocessor.impute_missing_values_KNN(data)
else:
if are_categorical_features:
data = self.preprocessor.encode_categorical_columns_from_mapping_file(data, categorical_features)
# One hot encoding categorical features in data X
X_cat = self.preprocessor.one_hot_encode_cagtegorical_col(data, categorical_features)
# Scaling Numerical Columns in data X
X_num = self.preprocessor.scale_numerical_columns_from_training_Scaler(data, categorical_features)
# concat numerical & categorical data together
data = pandas.concat([X_num, X_cat], axis=1)
# step 4 load the model & predict from the model
kmeans = self.model_functions_obj.load_model('Kmeans')
# step 5 Clustering & prediction
clusters = kmeans.predict(data)
data['Cluster'] = clusters
list_of_clusters = data['Cluster'].unique()
for i in list_of_clusters:
# step 5.1 get the data for each
cluster_data = data[data['Cluster'] == i]
policy_number= list(cluster_data.index)
# step 5.3 drop the cluster column
cluster_data.drop(['Cluster'], axis=1, inplace=True)
# dont do any reset index here , with index values we can join with policy number columns to
#final zip up the values .
# step 5.4 find correct model name wrt cluster number
model_name = self.model_functions_obj.find_correct_model_for_cluster(i)
model = self.model_functions_obj.load_model(model_name)
# step 5.5 use the model to predict the label
predictions = []
results = list(model.predict(cluster_data)) # predict is method of sklearn
for res in results:
if res >= 0 and res <0.5:
predictions.append('N')
else:
predictions.append('Y')
# step 5.6 save result in data frame & append the column saved in step 5.2
dataframe = pd.DataFrame(list(zip(policy_number, predictions)),
columns=['policy_number', 'Prediction(Fraud/not Fraud)'])
# step 5.7 save the final result into a csv
# appends to the csv file
dataframe.to_csv(self.prediction_output_file, index=False, header=True, mode='a+')
self.log_writer.log(self.file_obj, "End of Prediction!!")
return self.prediction_output_file, dataframe.head().to_json(orient="records")
except Exception as e:
self.log_writer.log(self.file_obj, "Error Occurred during Prediction!!Error :: %s" % e)
raise e