# EDA and data analysis

In [None]:
#Import all the libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Shape of the data: (number of rows, number of columns)
df.shape

# Column names, dtypes and non-null counts.
# `info` is a method: without the call parentheses the cell would only echo
# the bound-method object instead of printing the summary.
df.info()

# Check if there are any null values (missing entries counted per column)

df.isnull().sum()

## Univariate
# Count plot of the `school` column: how many students attend GP vs. MS
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='school', data=df, palette='pastel', ax=ax)
ax.set_title('Distribution of Schools')
ax.set_xlabel('School')
ax.set_ylabel('Count')
plt.show()

### QA
1. **Question:** What is the most common age group among the students in the dataset?

**Answer:** The most common age group among the students appears to be around 15 to 16 years old.

2. **Question:** Is the distribution of age skewed towards younger or older students?

**Answer:** The distribution of age appears to lean slightly towards younger students: most of the mass sits on the left side of the plot (lower ages). Note that a distribution concentrated at low values with its tail towards high values is called right-skewed (positively skewed). However, there is still a significant number of students in the older age groups, suggesting a relatively balanced distribution overall.

In [None]:
# Share of students per stated reason for choosing their school
reason_counts = df['reason'].value_counts()
plt.figure(figsize=(8, 6))
plt.pie(
    reason_counts,
    labels=reason_counts.index,
    autopct='%1.1f%%',
    colors=['lightblue', 'lightgreen', 'lightcoral', 'lightskyblue'],
)
# The pie covers every value of `reason`, not only school reputation,
# so the title describes the whole breakdown.
plt.title('Distribution of Reasons for Choosing School')
plt.show()

# Number of students at each weekend alcohol consumption level (Walc)
df['Walc'].value_counts()

1: Very low level of weekend alcohol consumption.

2: Low level of weekend alcohol consumption.

3: Moderate level of weekend alcohol consumption.

4: High level of weekend alcohol consumption.

5: Very high level of weekend alcohol consumption.

In [None]:
## Bivariate

# One count plot per parent, drawn side by side in a single figure.
# Each entry: (column, palette, x-axis label, panel title)
education_panels = [
    ('Medu', "Set2", "Mother's Education Level", "Mother's Education Level Distribution"),
    ('Fedu', "Set3", "Father's Education Level", "Father's Education Level Distribution"),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 6))
for ax, (column, palette_name, x_label, panel_title) in zip(axes, education_panels):
    sns.countplot(x=column, data=df, palette=palette_name, ax=ax)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Count')
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()



In [None]:
## Multivariate

# Final grade (G3) per gender, with one bar per address category
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='sex', y='G3', hue='address', data=df, palette='Set2', ax=ax)
ax.set_title('Final Grade (G3) by Gender and Address')
ax.set_xlabel('Gender')
ax.set_ylabel('Final Grade (G3)')
ax.legend(title='Address')
plt.show()



# Numeric columns shown together in one box plot
numerical_columns = ['age', 'traveltime', 'studytime', 'absences', 'G1', 'G2', 'G3']

# One box per selected column, all on a shared value axis
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df[numerical_columns], palette='Set2', ax=ax)
ax.set_title('Box Plot of Multiple Columns')
ax.set_ylabel('Value')
ax.set_xlabel('Column')
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the column names so they stay readable
plt.show()



import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations, restricted to the numeric columns
correlation_matrix = df.corr(numeric_only=True)

# Annotated heatmap: two-decimal cell labels in a fixed font size
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=.5,
    annot_kws={"size": 10},
)
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# outlier

def detect_outliers(columns, data=None, whisker=1.5):
    """Find rows that fall outside the IQR fences in any of the given columns.

    For each column the fences are ``Q1 - whisker * IQR`` and
    ``Q3 + whisker * IQR`` (both inclusive). A row is reported when its value
    in at least one column lies outside those fences. Values for which both
    comparisons are False (for example NaN) are reported as well.

    Parameters
    ----------
    columns : iterable of str
        Column names to screen.
    data : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level
        ``students_grades_df`` is used, which keeps existing one-argument
        calls working.
    whisker : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    numpy.ndarray
        Sorted, de-duplicated 0-based row *positions* (not index labels) of
        the flagged rows. Empty when ``columns`` is empty.
    """
    if data is None:
        data = students_grades_df  # fall back to the notebook-level frame

    flagged_positions = []
    for column in columns:
        values = data[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1  # interquartile range

        inside = ((values >= q1 - whisker * iqr) &
                  (values <= q3 + whisker * iqr)).to_numpy()
        flagged_positions.append(np.flatnonzero(~inside))

    # np.concatenate raises on an empty list, so handle "no columns" explicitly.
    if not flagged_positions:
        return np.array([], dtype=np.intp)
    return np.unique(np.concatenate(flagged_positions))

# Columns screened for outliers with the IQR rule implemented in detect_outliers.
numerical_columns = ['age', 'absences']
# Row positions flagged in at least one of the columns above.
# NOTE(review): no later cell visible here uses `outlier_indices` — confirm whether
# the flagged rows were meant to be dropped before the train/test split.
outlier_indices = detect_outliers(numerical_columns)

In [None]:
from sklearn.model_selection import train_test_split

# Split dataset: every column except the last is a feature, the last column is the target
X = students_grades_df.iloc[:, :-1]
y = students_grades_df.iloc[:, -1]

# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


## Encoding Data

In [None]:
# First we need to know which columns are binary, nominal and numerical
def get_columns_by_category(data=None):
    """Group feature columns into binary, nominal and numerical sets.

    * binary: ``object`` columns with exactly 2 distinct values
    * nominal: every other ``object`` column
    * numerical: ``int64`` / ``float64`` columns with more than 5 distinct values

    Numeric columns with 5 or fewer distinct values are not returned in any
    group.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Feature frame to classify. When omitted, the notebook-level ``X`` is
        used, which keeps existing zero-argument calls working.

    Returns
    -------
    tuple of pandas.Index
        ``(binary_columns, nominal_columns, numerical_columns)``.
    """
    if data is None:
        data = X  # fall back to the notebook-level feature frame

    # Distinct-value counts per column, computed once per dtype family
    object_unique = data.select_dtypes(include=['object']).nunique()
    numeric_unique = data.select_dtypes(include=['int64', 'float64']).nunique()

    is_binary = object_unique == 2

    # Slice the column index directly; there is no need to build intermediate
    # DataFrames just to read their `.columns`.
    binary_columns = object_unique.index[is_binary]
    nominal_columns = object_unique.index[~is_binary]
    numerical_columns = numeric_unique.index[numeric_unique > 5]

    return binary_columns, nominal_columns, numerical_columns

In [None]:
# Unpack the three column groups returned by get_columns_by_category().
# Note: this rebinds `numerical_columns`, replacing the lists assigned to that name earlier in the notebook.
binary_columns, nominal_columns, numerical_columns = get_columns_by_category()



In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
# Pair each column group with its preprocessing step, as (name, transformer, columns)
binary_step = ('binary', OrdinalEncoder(), binary_columns)
nominal_step = ('nominal', OneHotEncoder(), nominal_columns)
numerical_step = ('numerical', StandardScaler(), numerical_columns)
transformers = [binary_step, nominal_step, numerical_step]

# Columns that belong to none of the groups are passed through untouched
transformer_pipeline = ColumnTransformer(transformers=transformers, remainder='passthrough')

# Show the assembled transformer as the cell output
transformer_pipeline

# The column transformer is now defined, with one preprocessing step per column group. It still has to be fitted on the training data; after that we can move on to model training.