In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
%matplotlib inline

In [None]:
# Let pandas show up to 85 columns and 85 rows before truncating its output.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 85)

In [None]:
# Path to the Chronic Kidney Disease dataset (a CSV file, read with pd.read_csv).
# NOTE(review): hard-coded absolute Windows path -- adjust it for your machine.
file_path = r"D:\DataMining\Chronic Kidney Disease\ChronicKidneyDisease_dataset.csv"

In [None]:
# Load the CSV at `file_path` into the working DataFrame `df`.
df = pd.read_csv(file_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

## Practical Question 1
#### Apply data cleaning techniques on any dataset (e.g. Chronic Kidney Disease dataset from UCI repository). Techniques may include handling missing values, outliers and inconsistent values. Also, a set of validation rules may be specified for the particular dataset and validation checks performed.

#### Handling Missing Values

In [None]:
# Look at the first rows again before cleaning starts.
df.head()

In [None]:
# Trim leading/trailing whitespace (including stray tab characters) from every
# string cell; non-string values pass through untouched.
# The function is applied column by column through Series.map because
# DataFrame.applymap is deprecated since pandas 2.1, while its replacement
# DataFrame.map does not exist in older releases -- this form runs on both.
def _strip_if_str(value):
    """Return `value` without surrounding whitespace when it is a string."""
    return value.strip() if isinstance(value, str) else value


df = df.apply(lambda column: column.map(_strip_if_str))

# Missing values are encoded as "?" in this dataset; turn them into NaN so
# that pandas recognises them as missing. This must run after the stripping
# step so that padded markers such as "\t?" are caught as well.
df = df.replace("?", np.nan)

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

In [None]:
# Drop rows in which every column is missing (how='all'); rows holding at
# least one value are kept. `df` is modified in place.

df.dropna(axis = 'index', how = 'all', inplace = True)

In [None]:
# Remove rows that are exact duplicates of another row; `df` is modified in place.
df.drop_duplicates(inplace = True)

In [None]:
# Preview the frame after empty and duplicate rows were removed.
df.head()

In [None]:
# List the column names (used to pick the numeric columns in the next cell).
df.columns

In [None]:
# Columns that hold measurements and therefore must be stored as floats.
float_columns = [
    'age', 'bp', 'sg', 'al', 'su',
    'bgr', 'bu', 'sc', 'sod', 'pot',
    'hemo', 'pcv', 'rc', 'wc',
]

# Cast all of them in a single step; a value that cannot be parsed as a number
# raises here instead of being converted silently.
df[float_columns] = df[float_columns].astype(float)

In [None]:
# Confirm the dtypes after the float conversion.
df.info()

In [None]:
# Number of missing values per column, largest first.
df.isna().sum().sort_values(ascending = False)

In [None]:
# Split the columns by dtype: float64 columns are treated as numerical,
# object (string) columns as non-numerical.
numerical_cols = df.select_dtypes(include='float64').columns
non_numerical_cols = df.select_dtypes(include='object').columns

## Using pandas

In [None]:
# Impute missing values with plain pandas.

# Numerical columns: fill each gap with that column's median.
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

# Non-numerical columns: fill each gap with the column's most frequent value.
for col in non_numerical_cols:
    modes = df[col].mode()
    # mode() yields an empty Series when the column has no non-null value at
    # all; indexing it would raise, so such a column is left untouched.
    if modes.empty:
        continue
    df[col] = df[col].fillna(modes.iloc[0])

## Using sklearn

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Build the two sklearn imputers in one pass: the first fills NaN with the
# column median, the second with the column's most frequent value.
impute_median, imputer_mode = (
    SimpleImputer(missing_values=np.nan, strategy=strategy)
    for strategy in ('median', 'most_frequent')
)

In [None]:
# Numerical columns are filled with the median, non-numerical columns with the
# most frequent value -- each group goes through the imputer built for it.
df[numerical_cols] = impute_median.fit_transform(df[numerical_cols])
df[non_numerical_cols] = imputer_mode.fit_transform(df[non_numerical_cols])

In [None]:
# Show only the columns that still contain missing values; an empty result
# means nothing is left to impute.
df.isna().sum().loc[lambda x : x > 0]

In [None]:
# Box plots of every numerical column, laid out on a grid three plots wide.
n_cols = 3
full_rows, leftover = divmod(len(numerical_cols), n_cols)
n_rows = full_rows + (1 if leftover else 0)  # round up so a partial last row still fits

fig = plt.figure(figsize=(16, 4 * n_rows))  # 4 inches of height per grid row

for idx, col in enumerate(numerical_cols, start=1):
    ax = fig.add_subplot(n_rows, n_cols, idx)  # grid positions are 1-based
    sns.boxplot(data=df, x=col, ax=ax)
    ax.set_title(col, fontsize=10)

fig.tight_layout()  # keep titles and tick labels from overlapping
plt.show()

# Outliers

## Inter Quartile Range (IQR)
1. Sort data
2. Calculate Q1(25%) and Q3(75%)
3. IQR = Q3 - Q1
4. Find the lower Fence (Q1 - 1.5(IQR))
5. Find the upper Fence (Q3 + 1.5(IQR))

## Trimming Outliers

In [None]:
# Trim on a copy so that `df` itself keeps all of its rows.
trim_df = df.copy()

def trim_outliers_iqr(df, column):
    """Return the rows of `df` whose `column` value lies inside the IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, both inclusive, computed
    from `column` of the frame that is passed in. Rows whose value is NaN are
    dropped as well, because NaN fails both comparisons. `df` is not modified.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    fence_low = q1 - 1.5 * spread
    fence_high = q3 + 1.5 * spread

    inside_fences = (values >= fence_low) & (values <= fence_high)
    return df.loc[inside_fences]

# Apply the trimming to every numerical column.
# The trimming is sequential: each column's fences are computed on the rows
# that survived the filters of the columns before it, not on the full data.
for col in numerical_cols:
    trim_df = trim_outliers_iqr(trim_df, col)

## Capping

In [None]:
def cap_outliers(df, column):
    """Clamp `column` of `df` to its IQR fences and return the capped column.

    Values above Q3 + 1.5 * IQR are replaced by that upper fence, values below
    Q1 - 1.5 * IQR by the lower fence; everything else (NaN included) is kept.
    Note that `df[column]` is overwritten in place in addition to being
    returned.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    floor = q1 - 1.5 * spread
    ceiling = q3 + 1.5 * spread

    # Two passes: raise everything below the floor, then lower everything
    # above the ceiling. The masks come from the untouched original values.
    capped = np.where(values < floor, floor, values)
    capped = np.where(values > ceiling, ceiling, capped)

    df[column] = capped
    return df[column]

# Cap every numerical column of `df` at its own IQR fences.
for col in numerical_cols:
    df[col] = cap_outliers(df, col)

### After Capping Visualization

In [None]:
# Box plots of the capped frame `df`, laid out on a grid three plots wide.
n_cols = 3
full_rows, leftover = divmod(len(numerical_cols), n_cols)
n_rows = full_rows + (1 if leftover else 0)  # round up so a partial last row still fits

fig = plt.figure(figsize=(16, 4 * n_rows))  # 4 inches of height per grid row

for idx, col in enumerate(numerical_cols, start=1):
    ax = fig.add_subplot(n_rows, n_cols, idx)  # grid positions are 1-based
    sns.boxplot(data=df, x=col, ax=ax)
    ax.set_title(col, fontsize=10)

fig.tight_layout()  # keep titles and tick labels from overlapping
plt.show()

### After trimming Visualization

In [None]:
# Box plots of the trimmed frame `trim_df`, laid out on a grid three plots wide.
n_cols = 3
full_rows, leftover = divmod(len(numerical_cols), n_cols)
n_rows = full_rows + (1 if leftover else 0)  # round up so a partial last row still fits

fig = plt.figure(figsize=(16, 4 * n_rows))  # 4 inches of height per grid row

for idx, col in enumerate(numerical_cols, start=1):
    ax = fig.add_subplot(n_rows, n_cols, idx)  # grid positions are 1-based
    sns.boxplot(data=trim_df, x=col, ax=ax)
    ax.set_title(col, fontsize=10)

fig.tight_layout()  # keep titles and tick labels from overlapping
plt.show()

## Practical Question 3
#### Apply simple K-means algorithm for clustering any dataset. Compare the performance of clusters by varying the algorithm parameters. For a given set of parameters, plot a line graph depicting MSE obtained after each iteration.

In [None]:
from sklearn.datasets import make_blobs
from sklearn.metrics import mean_squared_error

In [None]:
# Load the student data used for the clustering exercise (path is relative to
# the working directory).
# NOTE(review): this rebinds `df`, so the frame cleaned in the previous
# section is no longer reachable under that name.
df = pd.read_csv('student_clustering.csv')

In [None]:
# Preview the first rows of the student data.
df.head()

In [None]:
# (number of rows, number of columns) of the student data.
df.shape

#### Student data for 200 students, with attributes such as CGPA and IQ
#### On the basis of these two values, we need to cluster the students

In [None]:
# Scatter of the two features before any clustering is applied.
fig, ax = plt.subplots()
ax.scatter(df['cgpa'], df['iq'])
ax.set_xlabel('cgpa')
ax.set_ylabel('iq')
plt.show()

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Elbow method: fit K-means for k = 1..10 and record the within-cluster sum of
# squares (inertia) of every fit.
wcss = []

for i in range(1, 11):
    # random_state pins the centroid initialisation so the curve is identical
    # on every run; n_init is spelled out because its default value differs
    # between scikit-learn releases.
    km = KMeans(n_clusters=i, n_init=10, random_state=42)
    km.fit(df)  # only inertia_ is read, so the predicted labels are not needed
    wcss.append(km.inertia_)

In [None]:
# Inspect the inertia recorded for each number of clusters.
wcss

In [None]:
# Elbow plot: inertia against the number of clusters. The x values are derived
# from len(wcss) (the first entry belongs to k = 1) so the plot stays aligned
# with however many values of k were fitted.
plt.plot(range(1, len(wcss) + 1), wcss)
plt.xlabel('number of clusters (k)')
plt.ylabel('WCSS (inertia)')
plt.title('Elbow curve')
plt.grid()
plt.show()

In [None]:
# Cluster on every column of the frame, passed as a plain NumPy array.
X = df.to_numpy()

# NOTE(review): k = 4 is presumably read off the elbow plot -- confirm.
# random_state fixes which integer label each cluster receives; the cells
# below select points and colours by label number, so without it their output
# would change from run to run. n_init is spelled out because its default
# value differs between scikit-learn releases.
km = KMeans(n_clusters=4, n_init=10, random_state=42)
y_means = km.fit_predict(X)

In [None]:
# Cluster label assigned to each row of X.
y_means

In [None]:
# Second-column values (index 1) of the rows assigned to cluster label 3.
X[y_means == 3, 1]

In [None]:
# Draw each cluster in its own colour: label 0 is blue, 1 red, 2 green, 3 yellow.
cluster_colours = ['blue', 'red', 'green', 'yellow']
for label, colour in enumerate(cluster_colours):
    in_cluster = y_means == label
    plt.scatter(X[in_cluster, 0], X[in_cluster, 1], color=colour)

plt.xlabel('cgpa')
plt.ylabel('iq')
plt.show()