<a href="https://colab.research.google.com/github/sergekamanzi/Formative2-Data-Preprocessing/blob/main/Formative_2_Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Part 1: Data Augmentation on CSV Files**

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PowerTransformer
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the raw customer transactions CSV and preview its first rows
data1 = pd.read_csv(
    filepath_or_buffer='/content/customer_transactions.csv',
    low_memory=False,
)
data1.head(n=5)

Unnamed: 0,customer_id_legacy,transaction_id,purchase_amount,purchase_date,product_category,customer_rating
0,151,1001,408,2024-01-01,Sports,2.3
1,192,1002,332,2024-01-02,Electronics,4.2
2,114,1003,442,2024-01-03,Electronics,2.1
3,171,1004,256,2024-01-04,Clothing,2.8
4,160,1005,64,2024-01-05,Clothing,1.3


In [3]:
# Dimensions of the freshly loaded frame as (rows, columns)
data1.shape

(150, 6)

In [4]:
# Strip leading/trailing whitespace from every column label
data1.columns = [label.strip() for label in data1.columns]

In [5]:
# Give the amount column a more descriptive name:
# purchase_amount -> TransactionAmount
column_mapping = dict(purchase_amount='TransactionAmount')
data1 = data1.rename(columns=column_mapping)

In [7]:
# Fail fast when the renamed amount column is absent, listing what is available
if not {'TransactionAmount'}.issubset(data1.columns):
    print("Column 'TransactionAmount' not found. Available columns:", data1.columns)
    raise KeyError("TransactionAmount column missing from dataset")

In [8]:
# Median-impute every numeric column that still contains missing values;
# columns without gaps are left untouched.
for numeric_col in data1.select_dtypes(include=['number']):
    if data1[numeric_col].isnull().any():
        data1[numeric_col] = SimpleImputer(strategy='median').fit_transform(data1[[numeric_col]])

In [9]:
# Per-column count of remaining missing values after imputation.
# The result is displayed rather than captured: `%%capture` without a target
# variable discards the output, which would make this check a silent no-op.
data1.isnull().sum()

In [10]:
# anonymizing transaction amounts while keeping them realistic
def add_noise(series, noise_level=0.01, random_state=None):
    """Return ``series`` with zero-mean Gaussian noise added to every element.

    The noise has standard deviation ``noise_level * series.std()``, so the
    perturbation scales with the spread of the data. The input is not
    modified; a new Series is returned.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to perturb.
    noise_level : float, default 0.01
        Noise standard deviation expressed as a fraction of ``series.std()``.
    random_state : int, numpy.random.RandomState, numpy.random.Generator or None
        Source of randomness. ``None`` (default) draws from NumPy's global
        legacy state, so ``np.random.seed`` still controls the result. An int
        seeds a private ``RandomState``; an existing generator is used as is.

    Returns
    -------
    pandas.Series
        ``series`` plus the generated noise, with the same index.
    """
    if random_state is None:
        rng = np.random  # module-level functions share the global legacy state
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    spread = series.std()
    # std() is NaN for fewer than two observations; without this guard the
    # NaN would propagate and wipe out the only value in the series.
    if np.isnan(spread):
        spread = 0.0

    return series + noise_level * rng.standard_normal(len(series)) * spread

# Seed NumPy's global generator before drawing the anonymisation noise so that
# a fresh Restart & Run All reproduces the same perturbed amounts (42 matches
# the seed already used for row sampling in this notebook).
np.random.seed(42)
data1['TransactionAmount'] = add_noise(data1['TransactionAmount'])

In [11]:
# Apply log(1 + x) to the skewed columns, all in one vectorised assignment
skewed_columns = ['TransactionAmount']
data1[skewed_columns] = np.log1p(data1[skewed_columns])

In [12]:
# Preview the first rows; TransactionAmount is now on the log1p scale
data1.head()

Unnamed: 0,customer_id_legacy,transaction_id,TransactionAmount,purchase_date,product_category,customer_rating
0,151,1001,6.014158,2024-01-01,Sports,2.3
1,192,1002,5.81034,2024-01-02,Electronics,4.2
2,114,1003,6.093434,2024-01-03,Electronics,2.1
3,171,1004,5.546827,2024-01-04,Clothing,2.8
4,160,1005,4.145162,2024-01-05,Clothing,1.3


In [13]:
# Generating synthetic data by resampling (duplicating) existing rows
def generate_synthetic_data(df, num_samples=1000, random_state=42):
    """Build synthetic transactions by bootstrapping rows of ``df``.

    Rows are drawn with replacement, their ``TransactionAmount`` is jittered
    with ``add_noise`` (noise level 0.02) and their ``transaction_id`` is
    converted to a string tagged with the ``_synthetic`` suffix. ``df`` itself
    is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; must contain ``TransactionAmount`` and
        ``transaction_id`` columns.
    num_samples : int, default 1000
        Number of synthetic rows to generate.
    random_state : int or None, default 42
        Seed forwarded to ``DataFrame.sample``. It controls which rows are
        drawn only; the amount noise comes from ``add_noise``, which is called
        without a seed here.

    Returns
    -------
    pandas.DataFrame
        ``num_samples`` rows that keep the index labels of the sampled rows.
    """
    synthetic_data = df.sample(n=num_samples, replace=True, random_state=random_state).copy()
    synthetic_data['TransactionAmount'] = add_noise(synthetic_data['TransactionAmount'], noise_level=0.02)
    # NOTE(review): rows are sampled with replacement, so the same source id can
    # appear several times and the suffixed ids are not guaranteed to be unique.
    synthetic_data['transaction_id'] = synthetic_data['transaction_id'].astype(str) + '_synthetic'
    return synthetic_data

# Create 500 synthetic rows and stack them under the originals with a fresh index
synthetic_transactions = generate_synthetic_data(df=data1, num_samples=500)
data1 = pd.concat(
    objs=(data1, synthetic_transactions),
    ignore_index=True,
)

In [14]:
# (rows, columns) after appending the synthetic rows
data1.shape

(650, 6)

In [15]:
# Write the augmented frame to CSV, leaving out the index column
data1.to_csv(
    path_or_buf='/content/customer_transactions_augmented.csv',
    index=False,
)

**Part 2: Merging Datasets with Transitive Properties**

**Part 3: Data Consistency and Quality Checks**