In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [2]:
pip install pandas statsmodels numpy


Note: you may need to restart the kernel to use updated packages.


In [4]:
# Set working directory
import os
os.chdir('C:\\Users\\HP\\Desktop')
print(os.getcwd())

C:\Users\HP\Desktop


In [5]:
# Load the dataset
data = pd.read_csv("NSSO68.csv" ,  low_memory=False)

In [6]:
# Display unique values in 'state_1' column
print(data['state_1'].unique())

['GUJ' 'ORI' 'CHTSD' 'MP' 'JRKD' 'WB' 'AP' 'MH' 'D&D' 'D&NH' 'MIZ' 'TRPR'
 'MANPR' 'ASSM' 'MEG' 'NAG' 'A&N' 'PNDCRY' 'TN' 'GOA' 'KA' 'KE' 'LKSDP'
 'SKM' 'Bhr' 'UP' 'RJ' 'ARP' 'DL' 'HR' 'Pun' 'HP' 'UT' 'Chandr' 'J$K']


In [7]:

# Subset data for state 'KA'
subset_data = data[['foodtotal_q', 'MPCE_MRP', 'MPCE_URP', 'Age', 'Meals_At_Home', 'Possess_ration_card', 'Education', 'No_of_Meals_per_day']]


In [8]:
# Print subset data
print(subset_data)

        foodtotal_q  MPCE_MRP  MPCE_URP  Age  Meals_At_Home  \
0         30.942394   3662.65   3304.80   50           59.0   
1         29.286153   5624.51   7613.00   40           56.0   
2         31.527046   3657.18   3461.40   45           60.0   
3         27.834607   3260.37   3339.00   75           60.0   
4         27.600713   2627.54   2604.25   30           59.0   
...             ...       ...       ...  ...            ...   
101657    28.441750    832.59    817.00   39           90.0   
101658    25.490282    862.13    773.20   38           90.0   
101659    25.800107    711.37    663.29   42           90.0   
101660    30.220170   1048.32    847.20   40           90.0   
101661    26.157279    834.03    689.57   60           90.0   

        Possess_ration_card  Education  No_of_Meals_per_day  
0                       1.0        8.0                  2.0  
1                       1.0       12.0                  2.0  
2                       1.0        7.0                  2

In [9]:
# Check for missing values
print(subset_data['MPCE_MRP'].isna().sum())
print(subset_data['MPCE_URP'].isna().sum())
print(subset_data['Age'].isna().sum())
print(subset_data['Possess_ration_card'].isna().sum())
print(data['Education'].isna().sum())


0
0
0
13
7


In [10]:
# Impute missing values with mean
def impute_with_mean(df, columns):
    for col in columns:
        df[col].fillna(df[col].mean(), inplace=True)

In [11]:
# Function to impute missing values with mean
def impute_with_mean(df, columns):
    for col in columns:
        df[col].fillna(df[col].mean(), inplace=True)
    return df


In [12]:
# Columns to impute
columns_to_impute = ['Education', 'MPCE_MRP', 'MPCE_URP', 'Age', 'Meals_At_Home', 'Possess_ration_card']


In [14]:
# Impute missing values with mean in the subset data
subset_data = impute_with_mean(subset_data, columns_to_impute)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col].fillna(df[col].mean(), inplace=True)


In [15]:
# Ensure no infinite values
subset_data = subset_data.replace([np.inf, -np.inf], np.nan)


In [16]:
# Drop rows with any remaining NaN values
subset_data.dropna(inplace=True)


In [17]:
# Fit the regression model
X = subset_data[['MPCE_MRP', 'MPCE_URP', 'Age', 'Meals_At_Home', 'Possess_ration_card', 'Education']]
X = sm.add_constant(X)  # Add a constant term for the intercept
y = subset_data['foodtotal_q']
model = sm.OLS(y, X).fit()


In [18]:
# Print the regression results
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:            foodtotal_q   R-squared:                       0.160
Model:                            OLS   Adj. R-squared:                  0.159
Method:                 Least Squares   F-statistic:                     3215.
Date:                Tue, 16 Jul 2024   Prob (F-statistic):               0.00
Time:                        23:26:26   Log-Likelihood:            -3.6905e+05
No. Observations:              101637   AIC:                         7.381e+05
Df Residuals:                  101630   BIC:                         7.382e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  15.8348    

In [19]:
# Check for multicollinearity using Variance Inflation Factor (VIF)
vif_data = pd.DataFrame()
vif_data['feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif_data)  # VIF value more than 8 is problematic


               feature        VIF
0                const  53.506630
1             MPCE_MRP   1.618222
2             MPCE_URP   1.460368
3                  Age   1.089462
4        Meals_At_Home   1.035366
5  Possess_ration_card   1.092325
6            Education   1.180639


In [20]:
# Extract the coefficients from the model
coefficients = model.params


In [21]:

# Construct the equation
equation = f"y = {round(coefficients[0], 2)}"
for i in range(1, len(coefficients)):
    equation += f" + {round(coefficients[i], 6)}*x{i}"
print(equation)



y = 15.83 + 0.00165*x1 + -4e-06*x2 + 0.078118*x3 + 0.052572*x4 + -2.416189*x5 + 0.121986*x6


  equation = f"y = {round(coefficients[0], 2)}"
  equation += f" + {round(coefficients[i], 6)}*x{i}"


In [22]:

# Display the first values of selected columns
print(subset_data['MPCE_MRP'].head(1).values[0])
print(subset_data['MPCE_URP'].head(1).values[0])
print(subset_data['Age'].head(1).values[0])
print(subset_data['Meals_At_Home'].head(1).values[0])
print(subset_data['Possess_ration_card'].head(1).values[0])
print(subset_data['Education'].head(1).values[0])
print(subset_data['foodtotal_q'].head(1).values[0])

3662.65
3304.8
50
59.0
1.0
8.0
30.942394
