In [None]:
# 1. Setting the working directory
# The project folder can be overridden with the NSSO_DATA_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original location is used, so existing behaviour is unchanged.
import os
os.chdir(os.environ.get(
    "NSSO_DATA_DIR",
    "C:\\Users\\user\\Desktop\\VCU\\BOOT CAMP\\SCMA-632-C51 - STATISTICL ANALYSIS & MODELING\\VCU_christ",
))

# 2. Importing Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.stats import weightstats as stests

# 3. Reading the dataset
# The file name is relative, so it is resolved against the working directory
# chosen in step 1. low_memory=False makes pandas parse the file in one pass
# instead of in chunks.
df = pd.read_csv(
    "NSSO68.csv",
    low_memory=False,
    encoding="Latin-1",
)

In [None]:
# 4. Filtering data for Nagaland
# Keep only the rows whose state code is "NAG".
state_data = df[df['state_1'] == "NAG"]

# Step 1 already switched the working directory to the project folder (the
# same folder the input CSV is read from), so a relative file name writes to
# the same place without repeating the machine-specific absolute path.
state_data.to_csv("nagaland_data.csv", index=False)


In [None]:
# 5. Display dataset information
# Each heading is printed on its own line, followed by the matching value.
print("Dataset Information:\n")
print("Column Names:", state_data.columns.tolist(), sep="\n")
print("\nFirst 5 Rows:", state_data.head(), sep="\n")
print("\nDimensions (rows, columns):", state_data.shape, sep="\n")
# Summing twice collapses the per-column null counts into a single total.
print("\nTotal Missing Values:", state_data.isna().sum().sum(), sep="\n")

In [15]:
# 6. Check for missing values in each column
# Count nulls per column, then list the columns with the most gaps first.
null_counts = state_data.isnull().sum()
missing_values = null_counts.sort_values(ascending=False)
print("Missing Values per Column (Descending Order):\n")
print(missing_values)

# 7. Subsetting the dataset
# Location identifiers followed by the meal and food-value columns that the
# later steps impute, filter and aggregate.
subset_columns = [
    'state_1', 'District', 'Region', 'Sector', 'State_Region',
    'Meals_At_Home',
    'ricetotal_v', 'wheattotal_v', 'Milktotal_v',
    'pulsestot_v', 'nonvegtotal_v', 'fruitstt_v',
    'No_of_Meals_per_day',
]
state_subset = state_data[subset_columns]

Missing Values per Column (Descending Order):

soyabean_q        1024
soyabean_v        1024
Meals_School      1011
Meals_Employer    1006
Meals_Others       949
                  ... 
carrot_q             0
radish_q             0
brinjal_q            0
tamato_q             0
fv_tot               0
Length: 384, dtype: int64


In [17]:
# 8. Impute missing values with mean
print(" Missing Values Before Imputation:\n")
print(state_subset.isna().sum())

# Only numeric columns get a mean; fillna matches the means to columns by
# label, so non-numeric columns are left as they are.
column_means = state_subset.mean(numeric_only=True)
state_cleaned = state_subset.fillna(column_means)

print("\n Missing Values After Imputation:\n")
print(state_cleaned.isna().sum())

# 9. Removing outliers using IQR
def remove_outliers(df, column_name, multiplier=1.5):
    """Return the rows of ``df`` whose ``column_name`` value is inside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``
    (both inclusive), where Q1/Q3 are the 25th/75th percentiles of the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column_name : str
        Numeric column used to compute the fences.
    multiplier : float, default 1.5
        Width of the fences in units of the IQR. The default reproduces the
        conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        An independent copy containing only the rows inside the fences, with
        the original index labels preserved.

    Notes
    -----
    Rows whose value is NaN fail both comparisons and are dropped. When the
    IQR is 0 (Q1 == Q3) only rows equal to that quartile value are kept.
    """
    values = df[column_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_threshold = q1 - multiplier * iqr
    upper_threshold = q3 + multiplier * iqr
    within_fences = (values >= lower_threshold) & (values <= upper_threshold)
    # .copy() detaches the result from ``df`` so callers can add or overwrite
    # columns on it without pandas raising SettingWithCopyWarning.
    return df[within_fences].copy()

outlier_columns = [
    'Meals_At_Home',
    'ricetotal_v', 'wheattotal_v', 'Milktotal_v',
    'pulsestot_v', 'nonvegtotal_v', 'fruitstt_v',
    'No_of_Meals_per_day',
]

# The filters are applied one after another, so each column's quartiles are
# computed on the rows that survived the previous columns.
for column in outlier_columns:
    state_cleaned = remove_outliers(df=state_cleaned, column_name=column)

print("\n📋 Columns in the Cleaned Dataset:")
print(state_cleaned.columns.tolist())

🔍 Missing Values Before Imputation:

state_1                0
District               0
Region                 0
Sector                 0
State_Region           0
Meals_At_Home          1
ricetotal_v            0
wheattotal_v           0
Milktotal_v            0
pulsestot_v            0
nonvegtotal_v          0
fruitstt_v             0
No_of_Meals_per_day    0
dtype: int64

✅ Missing Values After Imputation:

state_1                0
District               0
Region                 0
Sector                 0
State_Region           0
Meals_At_Home          0
ricetotal_v            0
wheattotal_v           0
Milktotal_v            0
pulsestot_v            0
nonvegtotal_v          0
fruitstt_v             0
No_of_Meals_per_day    0
dtype: int64

📋 Columns in the Cleaned Dataset:
['state_1', 'District', 'Region', 'Sector', 'State_Region', 'Meals_At_Home', 'ricetotal_v', 'wheattotal_v', 'Milktotal_v', 'pulsestot_v', 'nonvegtotal_v', 'fruitstt_v', 'No_of_Meals_per_day']


In [25]:
# 10. Create total consumption variable
# Row-wise sum of the six food-value columns.
consumption_columns = [
    'ricetotal_v', 'wheattotal_v', 'Milktotal_v',
    'pulsestot_v', 'nonvegtotal_v', 'fruitstt_v'
]
# assign() builds a new frame instead of writing a column into the filtered
# frame left by the outlier step, which avoids SettingWithCopyWarning and
# keeps the cell safe to re-run.
state_cleaned = state_cleaned.assign(
    total_consumption=state_cleaned[consumption_columns].sum(axis=1)
)

# 11. Summarize consumption
def summarize_consumption(df, group_col):
    """Total ``total_consumption`` per ``group_col`` value, largest first.

    Returns a two-column frame (``group_col``, ``total_consumption``) sorted
    by the summed consumption in descending order. The index keeps the
    positions assigned before sorting.
    """
    return (
        df.groupby(group_col)['total_consumption']
        .sum()
        .reset_index()
        .sort_values(by='total_consumption', ascending=False)
    )

# One summary table per grouping level.
district_summary, region_summary, sector_summary = (
    summarize_consumption(state_cleaned, grouping)
    for grouping in ('District', 'Region', 'Sector')
)

# The summaries are sorted in descending order, so head() holds the largest
# totals and tail() the smallest.
print("\n Top 4 Consuming Districts:", district_summary.head(4), sep="\n")
print("\n Region Consumption Summary:", region_summary, sep="\n")
print("\n Sector Consumption Summary:", sector_summary, sep="\n")
print("\n Bottom 4 Consuming Districts:", district_summary.tail(4), sep="\n")



 Top 4 Consuming Districts:
     District  total_consumption
4  Mokokchung       45476.929762
2      Kohima       38150.321428
5         Mon       31954.354762
8    Tuensang       30952.435714

 Region Consumption Summary:
   Region  total_consumption
0       1      321038.960714

 Sector Consumption Summary:
  Sector  total_consumption
0  RURAL      209952.501190
1  URBAN      111086.459524

 Bottom 4 Consuming Districts:
   District  total_consumption
3  Longleng       26528.561905
1   Kiphire       23318.038095
9     Wokha       19026.159524
6     Peren       18259.480952


In [23]:
# 12. Rename district and sector codes
# Lookup tables from the string form of each code to its display name.
district_mapping = {
    "1": "Mon", "2": "Tuensang", "3": "Mokokchung", "4": "Zunheboto",
    "5": "Wokha", "6": "Dimapur", "7": "Kohima", "8": "Phek",
    "9": "Kiphire", "10": "Longleng", "11": "Peren"
}
sector_mapping = {"1": "RURAL", "2": "URBAN"}

# Compare as strings so the codes match the mapping keys.
district_codes = state_cleaned['District'].astype(str)
sector_codes = state_cleaned['Sector'].astype(str)

# Values missing from a mapping come back as NaN from map(); fillna() then
# restores the original string. This also makes the cell safe to re-run:
# names that were already decoded are simply kept.
# assign() returns a new frame rather than writing columns into the filtered
# frame from the outlier step, which avoids SettingWithCopyWarning.
state_cleaned = state_cleaned.assign(
    District=district_codes.map(district_mapping).fillna(district_codes),
    Sector=sector_codes.map(sector_mapping).fillna(sector_codes),
)

# Updated summaries
# Recompute the tables so they group by the decoded names.
district_summary = summarize_consumption(df=state_cleaned, group_col='District')
region_summary = summarize_consumption(df=state_cleaned, group_col='Region')
sector_summary = summarize_consumption(df=state_cleaned, group_col='Sector')

for heading, table in (
    ("\n Updated District Summary (After Mapping):", district_summary.head(4)),
    ("\n Region Summary:", region_summary),
    ("\n Sector Summary:", sector_summary),
):
    print(heading)
    print(table)



 Updated District Summary (After Mapping):
     District  total_consumption
4  Mokokchung       45476.929762
2      Kohima       38150.321428
5         Mon       31954.354762
8    Tuensang       30952.435714

 Region Summary:
   Region  total_consumption
0       1      321038.960714

 Sector Summary:
  Sector  total_consumption
0  RURAL      209952.501190
1  URBAN      111086.459524


In [1]:
# 13. Z-Test: Urban vs Rural
# Split total consumption by sector label.
is_rural = state_cleaned['Sector'] == 'RURAL'
is_urban = state_cleaned['Sector'] == 'URBAN'
consumption_rural = state_cleaned.loc[is_rural, 'total_consumption']
consumption_urban = state_cleaned.loc[is_urban, 'total_consumption']

# Two-sample, two-sided z-test on the two groups.
z_statistic, p_value = stests.ztest(
    consumption_rural,
    consumption_urban,
    alternative='two-sided',
)

print("\n Z-Test for Rural vs Urban Consumption")
print("Z-Score:", round(z_statistic, 4))
print("P-Value:", round(p_value, 4))

# Decision at the 5% significance level.
verdict = (
    "Significant difference between Rural and Urban mean consumption (Reject H₀)"
    if p_value < 0.05
    else "No significant difference between Rural and Urban mean consumption (Fail to reject H₀)"
)
print(verdict)

# 14. Z-Test Between Top and Bottom Consuming Districts
# district_summary is sorted by total consumption in descending order, so the
# first row is the top district and the last row the bottom one.
district_names = district_summary['District']
top_district = district_names.iloc[0]
bottom_district = district_names.iloc[-1]

in_top = state_cleaned['District'] == top_district
in_bottom = state_cleaned['District'] == bottom_district
top_data = state_cleaned.loc[in_top, 'total_consumption']
bottom_data = state_cleaned.loc[in_bottom, 'total_consumption']

# Two-sample, two-sided z-test on the two districts' household totals.
z_statistic, p_value = stests.ztest(
    top_data,
    bottom_data,
    alternative='two-sided',
)

print(f"\n Z-Test: {top_district} vs {bottom_district}")
print("Z-Score:", round(z_statistic, 4))
print("P-Value:", round(p_value, 4))

# Decision at the 5% significance level.
verdict = (
    f"Significant difference between {top_district} and {bottom_district} mean consumption (Reject H₀)"
    if p_value < 0.05
    else f" No significant difference between {top_district} and {bottom_district} mean consumption (Fail to reject H₀)"
)
print(verdict)


NameError: name 'state_cleaned' is not defined