In [0]:
%sh
# List the contents of /team5/data.
ls /team5/data

In [1]:
%sh
# Install pandasql, which the import cell below loads as `ps`.
# NOTE(review): the version is not pinned — consider pinning it so reruns are reproducible.
pip install pandasql


## File:
`flatFile.csv`


In [3]:
%python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import pandasql as ps
import sqlite3


# Load the tab-separated flat file into the working DataFrame.
# low_memory=False asks pandas not to parse the file in chunks.
df = pd.read_csv(
    '/flatFile.csv',
    sep='\t',
    low_memory=False,
)



In [4]:
%python
# Data inspection, part 1: banner, dimensions and column names.
row_total = len(df)
column_total = len(df.columns)

print("===== COMPREHENSIVE DATA INSPECTION =====\n")

# 1. Basic dataset information
print("1. DATASET OVERVIEW")
print(f"Total Rows: {row_total}")
print(f"Total Columns: {column_total}")
print("\nColumn Names:")
print(", ".join(df.columns))

In [5]:
%python
# Print the concise frame summary (column dtypes and non-null counts).
df.info()

In [6]:
%python
# 2. Duplicate analysis: flag every row that exactly repeats an earlier row.
duplicates = df.duplicated()
duplicate_total = duplicates.sum()

print("\n2. DUPLICATE RECORDS")
print(f"Total Duplicate Rows: {duplicate_total}")

# Only show a sample when at least one duplicate exists.
if duplicate_total > 0:
    print("\nDuplicate Rows Sample:")
    print(df.loc[duplicates].head())



In [7]:
%python
# 3. Missing value analysis: absolute count and percentage per column.
missing_values = df.isna().sum()
missing_percentages = missing_values * 100 / len(df)
missing_table = pd.DataFrame(
    {
        'Missing Values': missing_values,
        'Percentage Missing': missing_percentages,
    }
)

print("\n3. MISSING VALUES")
# Report only the columns that actually contain gaps.
missing_data = missing_table.loc[missing_table['Missing Values'].gt(0)]
if missing_data.empty:
    print("No missing values found.")
else:
    print(missing_data)



In [8]:
%python
# Cardinality and the five most frequent values of every text/category column.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
print("\n5. CATEGORICAL COLUMN UNIQUE VALUES")
for col in categorical_cols:
    column_values = df[col]
    print(f"\nColumn: {col}")
    print(f"Total Unique Values: {column_values.nunique()}")
    print("Top 5 Values:\n", column_values.value_counts().head(5))



In [9]:
%python
# Summary statistics for the numeric columns, including the 90th and 99th percentiles.
numerical_cols = df.select_dtypes(include=[np.number]).columns
print("\n6. NUMERICAL COLUMN STATISTICS")
numerical_stats = df.loc[:, numerical_cols].describe(
    percentiles=[0.25, 0.50, 0.75, 0.90, 0.99]
)
print(numerical_stats)



In [10]:
%python
# Count values outside the 1.5 * IQR fences for every numeric column.
print("\n7. POTENTIAL OUTLIER INDICATORS")
total_rows = len(df)
for col in numerical_cols:
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    lower_bound = q1 - (1.5 * spread)
    upper_bound = q3 + (1.5 * spread)
    # Missing values compare False on both sides, so they are never counted.
    outlier_count = int((values.lt(lower_bound) | values.gt(upper_bound)).sum())
    print(f"\nColumn: {col}")
    print(f"Potential Outliers: {outlier_count} ({outlier_count/total_rows*100:.2f}%)")
    print(f"Lower Bound: {lower_bound}")
    print(f"Upper Bound: {upper_bound}")

In [11]:
%python
# ===== Step 1: Remove Duplicates =====
# Drop rows that are exact copies of an earlier row (all columns equal).
print(f"Initial Rows: {df.shape[0]}")
df = df.drop_duplicates()
# Report the row count, matching the "Initial Rows" line above
# (the original printed the whole (rows, columns) shape tuple here).
print(f"Rows After Removing Duplicates: {df.shape[0]}")
# Author's recorded shape after this step: 1631846, 31

In [12]:
%python

# Rows whose N_SOUSCRIP value already appeared in an earlier row.
duplicates_specific = df[df.duplicated(subset=['N_SOUSCRIP'])]

# Display the rows selected above (not the stale `duplicates` mask that an
# earlier cell computed before the frame was de-duplicated).
print("Duplicate rows:")
print(duplicates_specific)

# Count the same rows that were just displayed. Exact full-row duplicates
# were already removed, so df.duplicated() without a subset is always 0 here.
duplicate_count = len(duplicates_specific)
print(f"Number of duplicate rows: {duplicate_count}")


In [13]:
%python
# Number of rows that repeat an N_SOUSCRIP value seen in an earlier row.
duplicates = df.duplicated(subset='N_SOUSCRIP')
num_duplicates = duplicates.sum()
print(num_duplicates)


In [14]:
%python
# Keep only the last row for each N_SOUSCRIP value; df is modified in place.
df.drop_duplicates(subset=['N_SOUSCRIP'], keep='last', inplace=True)


In [15]:
%python
# Re-count repeated N_SOUSCRIP values to confirm the de-duplication.
duplicates = df.duplicated(subset=['N_SOUSCRIP'], keep='first')
num_duplicates = duplicates.sum()
print(num_duplicates)

In [16]:
%python

# Open an in-memory SQLite database (":memory:" means nothing is written to disk).
conn = sqlite3.connect(":memory:")
# Copy the DataFrame into table FLAT, replacing any table of that name.
df.to_sql(name="FLAT", con=conn, if_exists="replace", index=False)



In [17]:
%python

# Ten most frequent N_SOUSCRIP values with their share of all rows in FLAT.
query = (
    "SELECT N_SOUSCRIP, COUNT(*) AS count, "
    "(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM FLAT)) AS percentage "
    "FROM FLAT GROUP BY N_SOUSCRIP ORDER BY count DESC LIMIT 10;"
)
result = pd.read_sql_query(sql=query, con=conn)

# Show the result
print(result)

In [18]:
%python

# Fetch up to 20 rows for one specific N_SOUSCRIP value.
query = (
    "SELECT * FROM FLAT "
    "WHERE N_SOUSCRIP = 642214 LIMIT 20"
)
result = pd.read_sql_query(sql=query, con=conn)

print(result)



In [19]:
%python

# Rows of table FLAT for N_SOUSCRIP 642214 restricted to year = 2021.
query = '''SELECT * FROM FLAT WHERE year = 2021 AND N_SOUSCRIP = 642214 
           '''

# Run the query on the in-memory SQLite connection and load it as a DataFrame.
result = pd.read_sql_query(query, conn)

print(result)

In [20]:
%python
# Show the (rows, columns) shape of the current frame.
df.shape
# Author's recorded output: 369406, 31
# 1631846 - 369406 = 1262440 rows dropped

In [21]:
%python

# ===== Step 2: Handle Missing Values =====
# Percentage of missing entries per column.
missing_info = df.isna().mean().mul(100)
print("Missing Values Percentage:")
# List only columns with gaps, worst first.
columns_with_gaps = missing_info.loc[missing_info.gt(0)]
print(columns_with_gaps.sort_values(ascending=False))


In [22]:
%python

# Keep only the columns whose missing share is strictly below the threshold (in percent).
threshold = 50
missing_share = df.isnull().mean() * 100
keep_mask = missing_share < threshold
df = df.loc[:, keep_mask]
print(f"Columns After Dropping >{threshold}% Missing Values: {df.shape[1]}")
# Author's note: carroserie was dropped (57% missing values).


In [23]:
%python

# Replace missing entries in the object-typed columns with the label 'Unknown'.
categorical_cols = df.select_dtypes(include='object').columns
filled_text = df[categorical_cols].fillna('Unknown')
df[categorical_cols] = filled_text

In [24]:
%python

# Replace missing entries in each numeric column with that column's median.
numerical_cols = df.select_dtypes(include='number').columns
column_medians = df[numerical_cols].median()
df[numerical_cols] = df[numerical_cols].fillna(column_medians)


In [25]:
%python
# Preview the first rows of the current frame.
df.head()

# Z-score:
A value whose Z-score is greater than 3 or less than -3 may be considered an outlier.

In [27]:
%python

# ===== Step 3: Handle Outliers =====
from scipy.stats import zscore

# Define a function to remove outliers using Z-score
def remove_outliers_zscore(data, threshold=3):
    """Return the rows of ``data`` that have no numeric value with |z| >= threshold.

    Z-scores are computed per numeric column. Missing values are ignored when
    computing a column's mean and standard deviation.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. Non-numeric columns are kept but never inspected.
    threshold : float, default 3
        A row is dropped when any of its numeric values has an absolute
        z-score greater than or equal to this value.

    Returns
    -------
    pandas.DataFrame
        The retained rows, with all original columns.
    """
    numeric = data.select_dtypes(include=['number'])
    # Nothing to score: every row is trivially retained.
    if numeric.shape[1] == 0:
        return data.copy()

    # nan_policy='omit' stops a single NaN from turning a whole column's
    # z-scores into NaN.
    values = numeric.to_numpy(dtype=float, na_value=float('nan'))
    z_scores = zscore(values, axis=0, nan_policy='omit')

    # A NaN z-score (missing value, or a constant column whose standard
    # deviation is zero) compares False here, so it never marks a row as an
    # outlier. The previous `abs(z) < threshold` test treated NaN as a failure
    # and discarded every row in that situation.
    is_outlier = abs(z_scores) >= threshold
    return data[~is_outlier.any(axis=1)]

# Apply the z-score filter and report the row count on either side of it.
rows_before_outliers = len(df)
print(f"Rows Before Removing Outliers: {rows_before_outliers}")
df = remove_outliers_zscore(df)
print(f"Rows After Removing Outliers: {len(df)}")
# Author's recorded row count after this step: 339887

In [28]:
%python

# ===== Step 4: Univariate Analysis =====
import matplotlib.pyplot as plt
import seaborn as sns

# One histogram with a KDE overlay per numerical column.
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[col], kde=True, bins=30, color='blue', ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.show()


In [29]:
%python

# One bar chart per categorical column showing its ten most frequent values.
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(10, 5))
    top_counts = df[col].value_counts().head(10)
    top_counts.plot(kind='bar', color='orange', ax=ax)
    ax.set_title(f"Top Categories in {col}")
    plt.show()



In [30]:
%python

# Annotated heatmap of the pairwise correlations between the numeric columns.
corr_matrix = df[numerical_cols].corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [31]:
%python

def check_normality(df, numerical_cols, sample_size=5000, random_state=0):
    """Print a Shapiro-Wilk normality verdict for each listed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to test.
    numerical_cols : iterable of str
        Names of the columns to test.
    sample_size : int or None, default 5000
        Maximum number of values passed to the test. Larger columns are
        randomly subsampled, because scipy documents the Shapiro-Wilk p-value
        as potentially inaccurate for N > 5000. ``None`` disables subsampling.
    random_state : int or None, default 0
        Seed for the subsample, so repeated runs print the same result.

    Returns
    -------
    None
        Results are printed, one p-value line and one verdict line per column.
    """
    print("Normality Tests (Shapiro-Wilk):")
    for col in numerical_cols:
        # Missing values would otherwise make the statistic NaN.
        values = df[col].dropna()
        # The test needs at least three observations.
        if len(values) < 3:
            print(f"{col}: skipped (fewer than 3 non-missing values)")
            continue
        if sample_size is not None and len(values) > sample_size:
            values = values.sample(n=sample_size, random_state=random_state)
        stat, p_value = stats.shapiro(values)
        print(f"{col}: p-value = {p_value}")
        print("Normally distributed" if p_value > 0.05 else "Not normally distributed")

# Run the normality report on the current frame's numeric columns.
check_normality(df, numerical_cols)


In [32]:
%python

# Pairplot for key numerical variables
def create_pairplot(df, numerical_cols, sample_size=5000, random_state=42):
    """Draw a seaborn pairplot of the numeric columns on a random row sample.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to plot.
    numerical_cols : list-like of str
        Columns to include in the pairplot.
    sample_size : int, default 5000
        Maximum number of rows to plot; smaller frames are used in full.
    random_state : int or None, default 42
        Seed for the row sample so the figure is reproducible across runs.
        Pass ``None` for a different sample on every call.
    """
    # Subsample the rows; the seed keeps the picture stable between runs.
    sample_df = df.sample(n=min(len(df), sample_size), random_state=random_state)
    # sns.pairplot creates its own figure, so no plt.figure() call is made
    # beforehand (the original one only produced an extra empty figure).
    sns.pairplot(sample_df[numerical_cols], diag_kind='kde')
    # The pairplot figure is the current figure at this point.
    plt.suptitle("Pairplot of Numerical Variables", y=1.02)
    plt.show()
# Draw the pairplot for the current frame with the default sample size.
create_pairplot(df, numerical_cols)



In [33]:
%python
#  Visualization: Violin Plots
def create_violin_plots(df, numerical_cols):
    """Draw one violin plot per column on a two-column subplot grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to plot.
    numerical_cols : iterable of str
        Columns to plot, one subplot each.
    """
    columns = list(numerical_cols)
    # Nothing to draw.
    if not columns:
        return

    # Ceiling division: exactly enough rows for a two-column grid. The
    # previous `n // 2 + 1` reserved an unused row whenever n was even.
    n_rows = (len(columns) + 1) // 2
    # Grow the figure with the number of rows so the plots are not squeezed.
    plt.figure(figsize=(15, max(10, 4 * n_rows)))
    for i, col in enumerate(columns, 1):
        plt.subplot(n_rows, 2, i)
        sns.violinplot(x=df[col])
        plt.title(f'Violin Plot of {col}')
    plt.tight_layout()
    plt.show()

# Draw the violin plots for the current frame's numeric columns.
create_violin_plots(df, numerical_cols)