In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Pairplots
# Load seaborn's "tips" example dataset and draw a grid of pairwise plots
# for its numeric columns.

df = sns.load_dataset(name="tips")
sns.pairplot(data=df)
plt.show()


In [None]:
# Correlation Matrix
# Build a small table of student marks and show the pairwise correlation
# between the three subject columns as an annotated heatmap.

data = {
    'Student': ['A', 'B', 'C', 'D', 'E'],
    'Math': [90, 80, 70, 60, 50],
    'English': [70, 85, 65, 60, 55],
    'Science': [95, 78, 75, 62, 58],
}
df = pd.DataFrame(data)

# Only the numeric subject columns take part in the correlation;
# 'Student' is a label column and is left out.
subject_columns = ['Math', 'English', 'Science']
corr_matrix = df[subject_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Regression Plot
# Scatter the study hours against the scores and overlay a fitted
# linear regression line.

data = {
    'Hours': [1, 2, 3, 4, 5],        # Independent variable
    'Scores': [50, 60, 70, 80, 90],  # Dependent variable
}
df = pd.DataFrame(data)

# The figure size keyword is `figsize`; the previous `figure=(8,6)` was a typo
# that never applied the intended 8x6 size.
plt.figure(figsize=(8, 6))
sns.regplot(x='Hours', y='Scores', data=df)
plt.title('Regression Plot')
plt.show()

In [None]:
# Univariate Analysis
# Summary statistics for the 'Scores' column of `df`.
# NOTE: relies on `df` being the Hours/Scores frame created in the
# regression-plot cell, so that cell must have been run first.
scores = df['Scores']

# Variables are named after the column they describe (they were previously
# called *_age although they hold score statistics).
mean_score = scores.mean()
median_score = scores.median()
std_score = scores.std()
range_score = scores.max() - scores.min()

print(f"Mean Score: {mean_score}")
print(f"Median Score: {median_score}")
print(f"Standard Deviation of Score: {std_score}")
print(f"Range of Score: {range_score}")

# Last expression of the cell: rendered as the cell output.
scores.describe()


# **TASK 1:**

### **Problem Statement**:
* You are provided with a CSV dataset named `products_data.csv`, which contains information about various products, including their specifications, descriptive attributes (such as brand and category), and prices.
* Your task is to perform Exploratory Data Analysis (EDA) and build a predictive regression model to predict the price of a product based on its features.

---

### **Dataset**:
* CSV File Name: products_data.csv
* The dataset contains columns such as:
    * Product specifications (e.g., weight, dimensions, color)
    * Features (e.g., brand, category)
    * Target variable: Price (the price of each product)

---

### **Steps**:
1. `Data Loading and Preprocessing:`
    - Load the dataset from the CSV file products_data.csv.
    - Check for missing values and perform data cleaning (handle missing values if necessary).
    - Explore basic statistics and distribution of numerical features.
2. `Exploratory Data Analysis (EDA):`
    - Perform univariate analysis by visualizing the distribution of the target variable (Price).
    - Examine the correlation between features using a correlation matrix.
    - Use pair plots to visualize relationships between the features and the target variable (Price).
    - Visualize relationships between individual features and the target variable using regression plots.
3. `Predictive Modeling:`
    - Split the dataset into training and testing sets.
    - Build a linear regression model to predict the Price based on the features.
    - Evaluate the performance of the regression model using Mean Squared Error (MSE) and Root Mean Squared Error (RMSE).
4. `Model Evaluation:`
    - Visualize the comparison between actual and predicted values of the target variable (Price) using a scatter plot.

---

### **Requirements:**
* Perform all steps using Python and the libraries pandas, seaborn, matplotlib, and scikit-learn.
* The final model should predict the Price of products based on the features and be evaluated on a separate test set.
* Provide detailed visualizations that help in understanding the data and model performance.

---

### **Output:**
* The Python code used to perform EDA, train the model, and evaluate it.
* Visualizations and explanations for EDA results.
* Evaluation metrics (MSE and RMSE) of the trained regression model.
* A final conclusion on the quality of the model based on the evaluation results.

In [None]:
# Sample data set
# seaborn's "penguins" example dataset is loaded here as the sample data.

data = sns.load_dataset("penguins")

# Missing values per column before cleaning. A bare expression in the middle
# of a cell is not displayed, so the counts are printed explicitly.
print(data.isnull().sum())

# Drop every row that contains at least one missing value.
data = data.dropna()

# Last expression of the cell: displayed as the post-cleaning check.
data.isnull().sum()


In [None]:
# Correlation heatmap of the four numeric measurement columns in `data`.
measurement_columns = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
corr_matrix = data[measurement_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Pairwise relationships between the columns of `data`, coloured by species,
# with kernel density estimates on the diagonal.
# pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call is made beforehand (it would only produce an extra,
# empty figure in the output).
sns.pairplot(data, hue='species', diag_kind='kde')
plt.show()

In [None]:
# Two side-by-side regression plots drawn from `data`, one per (x, y) pair.
regression_panels = [
    ('bill_length_mm', 'bill_depth_mm', 'Bill Length vs Bill Depth'),
    ('flipper_length_mm', 'body_mass_g', 'Flipper Length vs Body Mass'),
]

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
for panel_ax, (x_col, y_col, panel_title) in zip(axes, regression_panels):
    sns.regplot(x=x_col, y=y_col, data=data, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# Predictive modelling in further topics