In [1]:
import pandas as pd
import numpy as np

In [2]:
partd = pd.read_csv("../labelled_data/part_D/features/partd_features_data.csv")

In [3]:
partd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1112168 entries, 0 to 1112167
Data columns (total 10 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Unnamed: 0        1112168 non-null  int64  
 1   PRSCRBR_NPI       1112168 non-null  int64  
 2   Prscrbr_Type_src  1112168 non-null  object 
 3   Tot_Benes         1112168 non-null  float64
 4   Tot_Clms          1112168 non-null  int64  
 5   Tot_30day_Fills   1112168 non-null  float64
 6   Tot_Day_Suply     1112168 non-null  int64  
 7   Tot_Drug_Cst      1112168 non-null  float64
 8   Fraud             1112168 non-null  int64  
 9   FraudType         1112168 non-null  object 
dtypes: float64(3), int64(5), object(2)
memory usage: 84.9+ MB


In [4]:
partd["Fraud"].value_counts()

Fraud
0    1111941
1        227
Name: count, dtype: int64

In [5]:
partd["Prscrbr_Type_src"].value_counts()

Prscrbr_Type_src
S    1008148
T     104020
Name: count, dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
partd['Prscrbr_Type_src'] = label_encoder.fit_transform(partd['Prscrbr_Type_src'])

In [7]:
partd["Prscrbr_Type_src"].value_counts()

Prscrbr_Type_src
0    1008148
1     104020
Name: count, dtype: int64

In [8]:
partd["FraudType"].value_counts()

FraudType
0            1111941
1128a1            71
1128b4            50
1128a4            36
1128a3            25
1128a1            24
1128b7             8
1128a2             4
1128b1             3
1128a3             3
1128Aa             1
BRCH SA            1
1128a2             1
Name: count, dtype: int64

In [9]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
partd['FraudType'] = label_encoder.fit_transform(partd['FraudType'])

In [10]:
partd["FraudType"].value_counts()

FraudType
0     1111941
2          71
10         50
8          36
6          25
3          24
11          8
4           4
9           3
7           3
1           1
12          1
5           1
Name: count, dtype: int64

In [11]:
val_partd = partd.sample(frac=0.1,random_state=0)
fraud_zero_indices = val_partd[val_partd["Fraud"] == 0].index
rows_to_remove = 100000

if len(fraud_zero_indices) >= rows_to_remove:
    random_indices_to_remove = np.random.choice(fraud_zero_indices, size=rows_to_remove, replace=False)
    val_partd = val_partd.drop(random_indices_to_remove)
else:
    print("Not enough rows with fraud=0 to remove.")

In [12]:
val_partd["Fraud"].value_counts()

Fraud
0    11184
1       33
Name: count, dtype: int64

In [13]:
fraud_zero_indices = val_partd.index
partd = partd.drop(fraud_zero_indices,axis=0)

In [14]:
partd["Fraud"].value_counts()

Fraud
0    1100757
1        194
Name: count, dtype: int64

In [15]:
val_partd["Fraud"].value_counts()

Fraud
0    11184
1       33
Name: count, dtype: int64

In [16]:
val_partd.to_csv("../labelled_data/part_D/features/validation_partd.csv")

## Resampling for Fraud 

In [17]:
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

X = partd.drop(['Fraud','FraudType'], axis=1)
y = partd['Fraud']
# Create a RandomUnderSampler instance
rus = RandomUnderSampler(random_state=42)

# Resample the data
X_resampled, y_resampled = rus.fit_resample(X, y)
partd_rus = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name='Fraud')], axis=1)

In [18]:
partd_rus["Fraud"].value_counts()

Fraud
0    194
1    194
Name: count, dtype: int64

In [19]:
partd_rus

Unnamed: 0.1,Unnamed: 0,PRSCRBR_NPI,Prscrbr_Type_src,Tot_Benes,Tot_Clms,Tot_30day_Fills,Tot_Day_Suply,Tot_Drug_Cst,Fraud
455002,513331,1407926819,0,170.0,535,676.100000,13627,12162.93,0
345156,389273,1306977806,0,31.0,64,64.333333,1464,3055.64,0
28585,32243,1023292844,0,15.0,18,18.000000,145,93.56,0
760891,858623,1689606865,0,12.0,17,17.000000,215,407.47,0
998128,1126458,1891842761,0,293.0,1130,1857.500000,48945,129441.19,0
...,...,...,...,...,...,...,...,...,...
1095152,1235918,1982662722,0,413.0,1330,3068.500000,91567,89058.19,1
1097158,1238215,1982743589,0,55.0,263,423.600000,12370,66723.17,1
1099571,1240977,1982888152,1,122.0,1040,2311.366667,67529,100240.86,1
1109884,1252558,1992860977,0,90.0,2132,2296.933333,64849,1444745.29,1


In [20]:
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

X = partd.drop(['Fraud','FraudType'], axis=1)
y = partd['Fraud']
# Create a RandomUnderSampler instance
rus = RandomOverSampler(random_state=42)

# Resample the data
X_resampled, y_resampled = rus.fit_resample(X, y)
partd_ros = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name='Fraud')], axis=1)

In [21]:
partd_ros["Fraud"].value_counts()

Fraud
0    1100757
1    1100757
Name: count, dtype: int64

In [22]:
partd_ros

Unnamed: 0.1,Unnamed: 0,PRSCRBR_NPI,Prscrbr_Type_src,Tot_Benes,Tot_Clms,Tot_30day_Fills,Tot_Day_Suply,Tot_Drug_Cst,Fraud
0,0,1003000126,0,106.0,324,369.800000,8621,20606.08,0
1,1,1003000142,0,228.0,1992,2145.666667,60953,79803.65,0
2,2,1003000167,0,43.0,57,57.700000,554,327.34,0
3,5,1003000423,0,67.0,218,404.933333,10242,19778.34,0
4,6,1003000480,0,26.0,48,76.466667,1929,7416.39,0
...,...,...,...,...,...,...,...,...,...
2201509,877474,1699886200,0,92.0,373,653.000000,18237,12874.00,1
2201510,1013367,1801860507,0,353.0,2262,3447.700000,88774,125113.79,1
2201511,483503,1386655991,0,78.0,167,206.000000,4626,14205.65,1
2201512,115176,1093164311,0,105.0,461,500.633333,12360,20087.75,1


In [23]:
from imblearn.combine import SMOTEENN

X = partd.drop(['Fraud','FraudType'], axis=1)
y = partd['Fraud']
# Create a RandomUnderSampler instance
rus = SMOTEENN(random_state=42)

# Resample the data
X_resampled, y_resampled = rus.fit_resample(X, y)
partd_smote = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name='Fraud')], axis=1)

In [24]:
partd_smote["Fraud"].value_counts()

Fraud
1    1087910
0    1074003
Name: count, dtype: int64

In [25]:
partd_smote

Unnamed: 0.1,Unnamed: 0,PRSCRBR_NPI,Prscrbr_Type_src,Tot_Benes,Tot_Clms,Tot_30day_Fills,Tot_Day_Suply,Tot_Drug_Cst,Fraud
0,0,1003000126,0,106.000000,324,369.800000,8621,20606.080000,0
1,1,1003000142,0,228.000000,1992,2145.666667,60953,79803.650000,0
2,2,1003000167,0,43.000000,57,57.700000,554,327.340000,0
3,5,1003000423,0,67.000000,218,404.933333,10242,19778.340000,0
4,6,1003000480,0,26.000000,48,76.466667,1929,7416.390000,0
...,...,...,...,...,...,...,...,...,...
2161908,544321,1437043633,0,24.078133,49,52.392148,1508,3112.681219,1
2161909,248463,1195156179,0,113.776623,1479,2939.901472,85151,187640.665985,1
2161910,999724,1790780496,0,404.280422,7940,11588.867112,314310,848560.522106,1
2161911,672883,1540991967,0,251.957151,4509,8041.917669,233873,319868.546902,1


## Resampling -- FraudType

In [26]:
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd

X = partd.drop(['Fraud','FraudType'], axis=1)
y = partd['FraudType']
# Create a RandomUnderSampler instance
rus = RandomUnderSampler(random_state=42)

# Resample the data
X_resampled, y_resampled = rus.fit_resample(X, y)
partd_rus_type = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name='FraudType')], axis=1)

In [27]:
partd_rus_type["FraudType"].value_counts()

FraudType
0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
Name: count, dtype: int64

In [28]:
partd_rus_type

Unnamed: 0.1,Unnamed: 0,PRSCRBR_NPI,Prscrbr_Type_src,Tot_Benes,Tot_Clms,Tot_30day_Fills,Tot_Day_Suply,Tot_Drug_Cst,FraudType
455002,513331,1407926819,0,170.0,535,676.1,13627,12162.93,0
12673,14290,1013087741,0,88.0,1025,1046.7,28957,41958.56,1
147768,166599,1134221351,0,348.0,1995,4385.333333,127810,67039.0,2
591288,667232,1538129127,0,23.0,127,178.333333,4645,15085.27,3
737338,832020,1669455168,0,17.0,32,83.533333,2454,3327.6,4
727207,820582,1659430163,0,32.0,145,352.9,10319,6788.08,5
567395,640292,1508968835,0,192.0,2254,3581.833333,106609,198297.18,6
851953,961594,1760666416,0,14.0,41,43.033333,1121,3551.61,7
303020,341648,1275505919,0,11.0,41,43.1,1207,2133.29,8
24927,28058,1023087293,0,59.0,75,77.0,791,554.31,9


In [29]:
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

X = partd.drop(['Fraud','FraudType'], axis=1)
y = partd['FraudType']
# Create a RandomUnderSampler instance
rus = RandomOverSampler(random_state=42)

# Resample the data
X_resampled, y_resampled = rus.fit_resample(X, y)
partd_ros_type = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name='FraudType')], axis=1)

In [30]:
partd_ros_type["FraudType"].value_counts()

FraudType
0     1100757
8     1100757
10    1100757
1     1100757
9     1100757
2     1100757
6     1100757
4     1100757
3     1100757
11    1100757
12    1100757
5     1100757
7     1100757
Name: count, dtype: int64

In [31]:
partd_ros_type

Unnamed: 0.1,Unnamed: 0,PRSCRBR_NPI,Prscrbr_Type_src,Tot_Benes,Tot_Clms,Tot_30day_Fills,Tot_Day_Suply,Tot_Drug_Cst,FraudType
0,0,1003000126,0,106.0,324,369.800000,8621,20606.08,0
1,1,1003000142,0,228.0,1992,2145.666667,60953,79803.65,0
2,2,1003000167,0,43.0,57,57.700000,554,327.34,0
3,5,1003000423,0,67.0,218,404.933333,10242,19778.34,0
4,6,1003000480,0,26.0,48,76.466667,1929,7416.39,0
...,...,...,...,...,...,...,...,...,...
14309836,483503,1386655991,0,78.0,167,206.000000,4626,14205.65,12
14309837,483503,1386655991,0,78.0,167,206.000000,4626,14205.65,12
14309838,483503,1386655991,0,78.0,167,206.000000,4626,14205.65,12
14309839,483503,1386655991,0,78.0,167,206.000000,4626,14205.65,12


In [32]:
partd_ros.to_csv("../labelled_data/part_D/features/partd_ros_features.csv")
partd_rus.to_csv("../labelled_data/part_D/features/partd_rus_features.csv")
partd_smote.to_csv("../labelled_data/part_D/features/partd_smote_features.csv")

In [33]:
partd_ros_type.to_csv("../labelled_data/part_D/features/partd_ros_type_features.csv")
partd_rus_type.to_csv("../labelled_data/part_D/features/partd_rus_type_features.csv")