In [26]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Probit

# Load the dataset
file_path = "E:\\data\\NSSO68.csv"
data = pd.read_csv(file_path, low_memory=False)

In [27]:
# Display the first few rows of the dataset to understand its structure
data.head()

Unnamed: 0,slno,grp,Round_Centre,FSU_number,Round,Schedule_Number,Sample,Sector,state,State_Region,...,pickle_v,sauce_jam_v,Othrprocessed_v,Beveragestotal_v,foodtotal_v,foodtotal_q,state_1,Region,fruits_df_tt_v,fv_tot
0,1,4.099999999999999e+31,1,41000,68,10,1,2,24,242,...,0.0,0.0,0.0,0.0,1141.4924,30.942394,GUJ,2,12.0,154.18
1,2,4.099999999999999e+31,1,41000,68,10,1,2,24,242,...,0.0,0.0,0.0,17.5,1244.5535,29.286153,GUJ,2,333.0,484.95
2,3,4.099999999999999e+31,1,41000,68,10,1,2,24,242,...,0.0,0.0,0.0,0.0,1050.3154,31.527046,GUJ,2,35.0,214.84
3,4,4.099999999999999e+31,1,41000,68,10,1,2,24,242,...,0.0,0.0,0.0,33.333333,1142.591667,27.834607,GUJ,2,168.333333,302.3
4,5,4.099999999999999e+31,1,41000,68,10,1,2,24,242,...,0.0,0.0,0.0,75.0,945.2495,27.600713,GUJ,2,15.0,148.0


In [6]:
list(data.columns)

['slno',
 'grp',
 'Round_Centre',
 'FSU_number',
 'Round',
 'Schedule_Number',
 'Sample',
 'Sector',
 'state',
 'State_Region',
 'District',
 'Stratum_Number',
 'Sub_Stratum',
 'Schedule_type',
 'Sub_Round',
 'Sub_Sample',
 'FOD_Sub_Region',
 'Hamlet_Group_Sub_Block',
 't',
 'X_Stage_Stratum',
 'HHS_No',
 'Level',
 'Filler',
 'hhdsz',
 'NIC_2008',
 'NCO_2004',
 'HH_type',
 'Religion',
 'Social_Group',
 'Whether_owns_any_land',
 'Type_of_land_owned',
 'Land_Owned',
 'Land_Leased_in',
 'Otherwise_possessed',
 'Land_Leased_out',
 'Land_Total_possessed',
 'During_July_June_Cultivated',
 'During_July_June_Irrigated',
 'NSS',
 'NSC',
 'MLT',
 'land_tt',
 'Cooking_code',
 'Lighting_code',
 'Dwelling_unit_code',
 'Regular_salary_earner',
 'Perform_Ceremony',
 'Meals_seved_to_non_hhld_members',
 'Possess_ration_card',
 'Type_of_ration_card',
 'MPCE_URP',
 'MPCE_MRP',
 'Person_Srl_No',
 'Relation',
 'Sex',
 'Age',
 'Marital_Status',
 'Education',
 'Days_Stayed_away',
 'No_of_Meals_per_day',
 'Me

In [28]:
# Create a new feature called NV
data['NV'] = data[['eggsno_q', 'fishprawn_q', 'goatmeat_q', 'beef_q', 'pork_q', 'chicken_q', 'othrbirds_q']].sum(axis=1).apply(lambda x: 1 if x > 0 else 0)

In [29]:
data.shape

(101662, 385)

In [30]:
df= data.copy()

In [31]:
df.dropna(how= 'all',inplace=True)

In [39]:
X.isnull().sum()

const                       0
HH_type                    27
Religion                    3
Social_Group               14
Regular_salary_earner      12
Possess_ration_card        13
Sex                         0
Age                         0
Marital_Status              2
Education                   7
Meals_At_Home            1219
Region                      0
hhdsz                       0
NIC_2008                 7511
NCO_2004                 7487
dtype: int64

In [37]:
# Assuming X is your DataFrame containing the independent variables
X['Social_Group'] = X['Social_Group'].astype('category')
X['Regular_salary_earner'] = X['Regular_salary_earner'].astype('category')
X['HH_type'] = X['HH_type'].astype('category')
X['Possess_ration_card'] = X['Possess_ration_card'].astype('category')
X['Sex'] = X['Sex'].astype('category')
X['Marital_Status'] = X['Marital_Status'].astype('category')
X['Education'] = X['Education'].astype('category')
X['Region'] = X['Region'].astype('category')

Unnamed: 0,const,HH_type,Religion,Social_Group,Regular_salary_earner,Possess_ration_card,Sex,Age,Marital_Status,Education,Meals_At_Home,Region,hhdsz,NIC_2008,NCO_2004
0,1.0,2.0,1.0,3.0,1.0,1.0,1,50,2.0,8.0,59.0,2,5,47510.0,411.0
1,1.0,2.0,3.0,9.0,1.0,1.0,2,40,3.0,12.0,56.0,2,2,85102.0,331.0
2,1.0,1.0,1.0,9.0,1.0,1.0,1,45,2.0,7.0,60.0,2,5,49219.0,121.0
3,1.0,2.0,3.0,9.0,1.0,1.0,1,75,3.0,6.0,60.0,2,3,49231.0,911.0
4,1.0,1.0,1.0,9.0,2.0,1.0,1,30,2.0,7.0,59.0,2,4,45403.0,121.0


In [38]:
# Add a constant term for the intercept
# Define dependent variable (y) and independent variables (X)
y = data['NV']
X = data[['HH_type', 'Religion', 'Social_Group', 'Regular_salary_earner', 
          'Possess_ration_card', 'Sex', 'Age', 'Marital_Status', 'Education', 
          'Meals_At_Home', 'Region', 'hhdsz', 'NIC_2008', 'NCO_2004']]
X= sm.add_constant(X)

# Fit the probit regression model
probit_model = Probit(y, X).fit()

# Print the summary of the model
print(probit_model.summary())

MissingDataError: exog contains inf or nans