In [43]:
import pandas as pd
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, LabelEncoder , OneHotEncoder
from sklearn.impute import SimpleImputer

In [47]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd

class DataPreprocessor:
    """Fit a small preprocessing pipeline on one CSV file and replay it on another.

    Intended call order on the training file:
    ``load_data`` -> ``impute_missing_values`` -> ``scale_numeric_values`` ->
    ``encode_categorical_values`` -> ``save_preprocessed_data``.
    Afterwards ``transform_test_data`` applies the already-fitted imputers,
    scaler and per-column encoders to a second CSV file without refitting them.
    """

    # Integer code that transform_test_data assigns to categories the fitted
    # encoders never saw.
    UNSEEN_CATEGORY_CODE = -1

    def __init__(self, data_path):
        """Store the CSV path and create the (still unfitted) transformers.

        Args:
            data_path: Path handed to ``pd.read_csv`` by ``load_data``.
        """
        self.data_path = data_path
        self.df = None
        # Both column lists are filled in by impute_missing_values().
        self.num_cols = None
        self.cat_cols = None
        self.num_imputer = SimpleImputer(strategy='mean')
        self.cat_imputer = SimpleImputer(strategy='most_frequent')
        self.scaler = StandardScaler()
        # Not used by any method of this class; kept so existing code that
        # reads the attribute keeps working.
        self.encoder = LabelEncoder()
        # One fitted LabelEncoder per categorical column, keyed by column name.
        self.encoders = {}

    def _require_columns(self):
        """Raise RuntimeError if the column lists have not been determined yet."""
        if self.num_cols is None or self.cat_cols is None:
            raise RuntimeError(
                "impute_missing_values() must be called before this step"
            )

    def load_data(self):
        """Read the CSV at ``self.data_path`` into ``self.df``."""
        self.df = pd.read_csv(self.data_path)

    def impute_missing_values(self):
        """Record numeric/categorical columns and fill their missing values.

        Columns of dtype int64/float64 are imputed with the column mean,
        columns of dtype object with the most frequent value. The fitted
        imputers are kept for ``transform_test_data``.

        Raises:
            RuntimeError: If ``load_data`` has not been called.
        """
        if self.df is None:
            raise RuntimeError("load_data() must be called before imputing")
        self.num_cols = self.df.select_dtypes(include=['int64', 'float64']).columns
        self.cat_cols = self.df.select_dtypes(include=['object']).columns
        # Skip a group with no columns: there is nothing to fit the imputer on.
        if len(self.num_cols):
            self.df[self.num_cols] = self.num_imputer.fit_transform(self.df[self.num_cols])
        if len(self.cat_cols):
            self.df[self.cat_cols] = self.cat_imputer.fit_transform(self.df[self.cat_cols])

    def scale_numeric_values(self):
        """Fit the scaler on the numeric columns and replace them with scaled values.

        Raises:
            RuntimeError: If ``impute_missing_values`` has not been called.
        """
        self._require_columns()
        if len(self.num_cols):
            self.df[self.num_cols] = self.scaler.fit_transform(self.df[self.num_cols])

    def encode_categorical_values(self):
        """Label-encode each categorical column with its own encoder.

        Every fitted encoder is stored in ``self.encoders`` under the column
        name so the same mapping can be reused on other data.

        Raises:
            RuntimeError: If ``impute_missing_values`` has not been called.
        """
        self._require_columns()
        for col in self.cat_cols:
            encoder = LabelEncoder()
            self.df[col] = encoder.fit_transform(self.df[col])
            self.encoders[col] = encoder

    def save_preprocessed_data(self, output_path):
        """Write ``self.df`` to ``output_path`` as CSV without the index column."""
        self.df.to_csv(output_path, index=False)

    def transform_test_data(self, test_data_path):
        """Apply the fitted imputers, scaler and encoders to another CSV file.

        Nothing is refitted: categorical values are mapped through the
        categories each encoder learned in ``encode_categorical_values``.
        Values that were not seen at fit time are encoded as
        ``UNSEEN_CATEGORY_CODE`` and reported with a warning.

        Args:
            test_data_path: Path of the CSV file to transform.

        Returns:
            The transformed DataFrame.

        Raises:
            RuntimeError: If ``impute_missing_values`` has not been called.
        """
        import warnings

        self._require_columns()
        test_df = pd.read_csv(test_data_path)
        if len(self.num_cols):
            test_df[self.num_cols] = self.num_imputer.transform(test_df[self.num_cols])
        if len(self.cat_cols):
            test_df[self.cat_cols] = self.cat_imputer.transform(test_df[self.cat_cols])
        if len(self.num_cols):
            test_df[self.num_cols] = self.scaler.transform(test_df[self.num_cols])
        for col in self.cat_cols:
            encoder = self.encoders[col]
            # Reuse the fitted label -> code mapping; calling fit_transform here
            # would re-learn the codes from the test data and overwrite the
            # stored encoder.
            code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
            codes = test_df[col].map(code_by_label)
            unseen = codes.isna()
            if unseen.any():
                warnings.warn(
                    f"{int(unseen.sum())} value(s) in column {col!r} were not seen "
                    f"during fitting and are encoded as {self.UNSEEN_CATEGORY_CODE}",
                    stacklevel=2,
                )
            test_df[col] = codes.fillna(self.UNSEEN_CATEGORY_CODE).astype(int)
        return test_df

In [76]:
# Create the preprocessor for the raw CSV; the constructor only stores the
# path, the file itself is read later by load_data().
# NOTE(review): '/content/...' is an absolute path — presumably a Colab
# workspace; confirm before running this notebook elsewhere.
preprocessor = DataPreprocessor(data_path='/content/dataset.csv')

In [77]:
# Run the pipeline in order: impute_missing_values() records the numeric and
# categorical column lists that the scaling and encoding steps read.
preprocessor.load_data()
preprocessor.impute_missing_values()
preprocessor.scale_numeric_values()
preprocessor.encode_categorical_values()
# Persist the fully preprocessed frame (written without the index column).
preprocessor.save_preprocessed_data(output_path='preprocessed_data.csv')

In [78]:
# Reload the preprocessed data from the same relative path it was saved to,
# so the notebook does not depend on the working directory being '/content'.
train_data = pd.read_csv('preprocessed_data.csv')

In [79]:
# Keep the column index of the preprocessed training frame for later reference.
train_columns = train_data.columns

In [80]:
# Display the column names of the preprocessed training data.
train_columns

Index(['MSISDN', 'TRANSACTION_DATE', 'TRANSACTION_TYPE', 'TONECODE',
       'TONENAME', 'ARTIST', 'VENDOR', 'GENRE', 'LANGUAGE', 'DOWNLOAD_MEDIUM',
       'AUTO_RENEWAL_DATE', 'AUTO_RENEWAL_STATUE', 'AMOUNT', 'CHARGED_AMOUNT',
       'SUBSCRIPTION_DATE', 'TRANSACTION_STATUS', 'FAIL_REASON',
       'GIFT_COPY_NUMBER', 'UPLOAD_DATE', 'TIME_KEY', 'MSISDN_ENCR_INT',
       'BATCH_ID', 'INSERT_DATE_TIME', 'UPDATE_DATE_TIME'],
      dtype='object')

In [81]:

# Isolation Forest with a fixed seed so repeated runs give the same model;
# contamination is passed as the plain float 0.1.
model_IF = IsolationForest(contamination=0.1, random_state=42)

In [83]:

# Fit the Isolation Forest on every column of the preprocessed frame as-is.
# NOTE(review): this includes identifier-like columns such as MSISDN and the
# label-encoded date columns — confirm they are meant to be model features.
model_IF.fit(train_data)



In [84]:
# Preview the first five rows of the preprocessed training data.
train_data.head()


Unnamed: 0,MSISDN,TRANSACTION_DATE,TRANSACTION_TYPE,TONECODE,TONENAME,ARTIST,VENDOR,GENRE,LANGUAGE,DOWNLOAD_MEDIUM,...,SUBSCRIPTION_DATE,TRANSACTION_STATUS,FAIL_REASON,GIFT_COPY_NUMBER,UPLOAD_DATE,TIME_KEY,MSISDN_ENCR_INT,BATCH_ID,INSERT_DATE_TIME,UPDATE_DATE_TIME
0,0,34311,1,-0.612567,2256,759,30,35,6,5,...,5033,1,3,0.0,7,1.291219,-0.306742,7,7,7
1,0,34312,1,-0.670202,1460,1036,47,27,6,9,...,46014,1,3,0.0,7,1.291219,-0.185252,7,7,7
2,0,34313,1,-0.669974,1593,1321,7,35,6,7,...,42201,1,3,0.0,7,1.291219,-0.34871,7,7,7
3,0,34314,1,-0.610241,3473,759,30,35,6,0,...,30577,1,3,0.0,7,1.291219,0.579068,7,7,7
4,0,34315,3,-0.062233,334,971,15,35,6,7,...,46834,1,3,0.0,7,1.291219,0.59783,7,7,7


In [86]:
 # Score the same rows the model was fitted on and keep the predicted labels.
 # NOTE(review): the leading space on this cell's lines would be an
 # IndentationError in a plain .py script — consider removing it.
 predictions = model_IF.predict(train_data)

In [87]:
 # Display the array of predicted labels.
 predictions

array([1, 1, 1, ..., 1, 1, 1])