In [1]:
import pandas as pd

In [2]:
# Small in-memory dataset: each key becomes a column label and each list holds
# that column's values in row order (six rows in total).
data = {
    'R&B Spend': [160_000, 230_000, 400_000, 250_000, 800_000, 750_000],
    'Marketing spend': [40_000, 50_000, 60_000, 35_000, 20_000, 75_000],
    'Administration': [120_000, 130_000, 140_000, 150_000, 160_000, 162_300],
    'profit': [30_000, 20_000, 230_230, 212_230, 50_000, 230_000],
}

# Build the frame from the dictionary and preview the first five rows
# (five is the default for head()).
df = pd.DataFrame(data)
df.head()

Unnamed: 0,R&B Spend,Marketing spend,Administration,profit
0,160000,40000,120000,30000
1,230000,50000,130000,20000
2,400000,60000,140000,230230
3,250000,35000,150000,212230
4,800000,20000,160000,50000


In [3]:
# Inspect the column labels of df (returned as a pandas Index).
df.columns

Index(['R&B Spend', 'Marketing spend', 'Administration', 'profit'], dtype='object')

In [4]:
# Split the data into the feature matrix X and the target (profit).
# The target is stored in two forms so the selection styles can be compared:
#   df[['profit']] (double brackets) -> one-column DataFrame
#   df['profit']   (single brackets) -> Series
X = df[['R&B Spend', 'Marketing spend', 'Administration']]
y_dataframe = df[['profit']]
y_series = df['profit']

In [7]:
# Preview the target selected with double brackets (a one-column DataFrame).
y_dataframe.head()

Unnamed: 0,profit
0,30000
1,20000
2,230230
3,212230
4,50000


In [6]:
# Preview the target selected with single brackets (a Series).
y_series.head()

0     30000
1     20000
2    230230
3    212230
4     50000
Name: profit, dtype: int64

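##### In the two outputs above, y_dataframe.head() and y_series.head(), the main difference is the column header: the DataFrame displays the column name (`profit`) above the values, while the Series has no column header and shows its name on the last line (`Name: profit`).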

In [8]:
# When we assign multiple columns to X (df[['a', 'b']]), the result is automatically a DataFrame.
# But when we select a single column with single brackets (df['profit']), the result is a Series.
# A Series object has no column index; it is a one-dimensional labelled array.

# To keep the target two-dimensional, assign it as y = df[['profit']]. This makes y a DataFrame instead of a Series.

In [10]:
# (model.coef_).shape --> (1, 3) --> 2D Array
# (model.coef_.ravel()).shape --> (3,) --> 1D Array

# ravel() is used for converting a 2D array into 1D array. 

#### Creating a Correlation Matrix

##### Correlation_matrix = df.corr()
##### print(Correlation_matrix)

##### Refer to the format below for creating a table in Markdown (Jupyter notebook)

|         | Size    | Bedrooms |  Distance_from_City |    Price |
|:--------|:-------:|:--------:|:-------------------:|---------:|
|Size     | 1.000000| 0.618609 |      0.424136       | 0.996648 |
| Bedrooms| 0.618609| 1.000000 |      0.972345       | 0.650000 |
| Distance_from_City | 0.424136| 0.972345 |      1.000000       | 0.458124 |
| Price   | 0.996648| 0.650000 |      0.458124       | 1.000000 |

#### Understanding Correlation in Data Science

In [11]:
# r = 1, Perfect positive correlation (variables move in the same direction)
# r = 0, No linear correlation (no linear relationship between the variables)
# r = -1, Perfect negative correlation (variables move in opposite directions)
# The diagonal always shows 1 because it’s the correlation of a variable with itself.
# Look at off-diagonal values to identify correlations between different independent variables.
# If r > 0.8 or r < -0.8, the variables are highly correlated → Possible multicollinearity issue.

##### What to Do If You Find High Correlation?

In [None]:
# If two features are highly correlated (|r| > 0.8), take action:
# 1. Drop One of the Features
#      Keep the one that makes more sense from a business perspective.
#      df = df.drop(columns=['Feature2'])  # Drop the redundant feature

# 2. Use PCA (Principal Component Analysis)
#      PCA reduces correlated variables into uncorrelated components.
#from sklearn.decomposition import PCA
#pca = PCA(n_components=2)  # Reduce dimensions
#new_features = pca.fit_transform(df[['Feature1', 'Feature2', 'Feature3']])

# 3. Apply Ridge Regression (Regularization)
#      Ridge Regression reduces the impact of collinearity using an L2 penalty.
#      Instead of removing variables, it shrinks their effect.
#from sklearn.linear_model import Ridge
#ridge_model = Ridge(alpha=1.0)
#ridge_model.fit(df[['Feature1', 'Feature2', 'Feature3']], df['Target'])

##### What to Do If You Find Medium Correlation?

In [12]:
# Moderate correlation is 0.5 ≤ |r| < 0.8.
# For moderate correlation (0.5 ≤ |r| < 0.8), it’s not always necessary to take action, 
# but you should analyze further to ensure it doesn't negatively impact your model.

# Even if correlation is moderate, check if it causes multicollinearity using VIF.
# VIF < 5 → No issue ✅
# VIF between 5-10 → Moderate multicollinearity ⚠️ (Consider reducing)
# VIF > 10 → High multicollinearity ❌ (Remove or transform variables)

##### Handling Multicollinearity - Multicollinearity occurs when independent variables are highly correlated. Detect using the Variance Inflation Factor (VIF):

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# variance_inflation_factor() regresses column i on the remaining columns of the
# matrix it is given and does NOT add an intercept itself. Without a constant
# column the auxiliary regressions are forced through the origin, which distorts
# (typically inflates) the reported VIFs. Append a constant as the LAST column so
# the positions of the original features are unchanged.
X_with_const = X.assign(const=1.0)
design_matrix = X_with_const.to_numpy(dtype=float)

# One VIF per original feature; the constant column itself is not reported.
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(design_matrix, i) for i in range(X.shape[1])]

print(vif_data)

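Example output format (illustrative values from a different dataset; the feature names below do not match the columns of `X` above):

| Feature            | VIF       |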
|--------------------|----------|
| Size              | 24.911368 |
| Bedrooms          | 55.152882 |
| Distance_from_City | 15.444262 |