# <h1 style="text-align: center; color:magenta"> Prediction Section </h1>

## <h2 style="text-align: left; color:cyan"> Data Inspection </h2>

In [1]:
import os
import pandas as pd
import numpy as np

### Loading Data

In [2]:
# These functions are needed to check dtypes and convert if necessary
from pandas.api.types import is_string_dtype

In [3]:
# A dictionary of sheet description to better remember the purpose of each sheet.
sheets_desc = {"Data": "مشخصات پرسشنامه",
               "P1":"قسمت یکم: خصوصیات اجتماعی اعضای خانوار",
               "P2": "قسمت دوم: مشخصات محل سکونت",
               "P3S01": "قسمت سوم: بخش ۱ هزینه‌های خوراکی خانوار در ماه گذشته",
               "P3S02": "قسمت سوم: بخش ۲ هزینه‌های نوشیدنی‌های طبقه‌بندی نشده و دخانی های خانوار در ماه گذشته",
               "P3S03": "قسمت سوم: بخش ۳ هزینه‌های پوشاک و کفش خانوار در ماه گذشته",
               "P3S04": "قسمت سوم: بخش ۴ هزینه‌های بخش مسکن، آب، فاضلاب، سوخت و روشنایی خانوار در ماه گذشته",
               "P3S05": "قسمت سوم: بخش ۵ هزینه‌های مبلمان و لوازم خانگی",
               "P3S06": "قسمت سوم: بخش ۶ هزینه‌های بهداشتی و درمانی خانوار در ماه گذشته",
               "P3S07": "قسمت سوم: بخش ۷ هزینه‌های حمل و نقل خانوار در ماه گذشته",
               "P3S08": "قسمت سوم: بخش ۸ هزینه‌های ارتباطات خانوار در ماه گذشته",
               "P3S09": "قسمت سوم: بخش ۹ هزینه‌های خدمات فرهنگی و تفریحات خانوار در ماه گذشته",
               "P3S10": "قسمت سوم: بخش ۱۰ هزینه‌های آموزش و تحصیل",  # empty sheet, included in P3S13
               "P3S11": "قسمت سوم: بخش ۱۱ هزینه‌های غذاهای آماده، هتل و رستوران خانوار در ماه گذشته",
               "P3S12": "قسمت سوم: بخش ۱۲ هزینه‌های کالاها و خدمات متفرقه خانوار در ماه گذشته",
               "P3S13": "قسمت سوم: بخش ۱۳ سایر هزینه‌ها و انتقالات در ۱۲ ماه گذشته",
               "P3S14": "قسمت سوم: بخش ۱۴ سرمایه‌گذاری خانوار در ۱۲ ماه گذشته",
               "P4S01": "قسمت چهارم: بخش ۱ درآمد پولی اعضای شاغل خانوار از مشاغل مزد و حقوق بگیری",
               "P4S02": "قسمت چهارم: بخش ۲ درآمد پولی اعضای شاغل خانوار از مشاغل غیر مزد و حقوق بگیری (آزاد)",
               "P4S03": "قسمت چهارم: بخش ۳ درآمدهای متفرقه خانوار در ۱۲ ماه گذشته",
               "P4S04": "قسمت چهارم: ستون ۹ بخش ۳ شامل وام و یارانه"
               }

In [4]:
#Desired info:
sheet_indices = [0,1,2,9,10,13, 15, 17,18,19,20]

#### Reading the files

Looping through the data folder to get all the file names

In [5]:
resource_directory = "data"
resource_files : list[str] = []
for f in os.listdir(resource_directory):
    f_path = os.path.join(resource_directory, f)
    if os.path.isfile(f_path):
        resource_files.append(f)
del f, f_path

In [6]:
def empty_string_remover(df):
    for col in df.columns:
    # Checking if string dtype
        if is_string_dtype(df[col]):
            # replace cells with only empty strings with np.nan
            df[col] = df[col].replace(r'^\s*$', np.nan, regex=True)
    return df

In [7]:
dfs = []

for filename in resource_files:

    file_path = os.path.join(resource_directory, filename)
    year_file = pd.ExcelFile(file_path)

    if filename.startswith("U"):
        Urban = True
    else:
        Urban = False

    year = int(filename.strip(".xlsx")[1:])
    if (year==99) or (year ==98):
        year = 1300+year

    for sheet_index in sheet_indices:
        
        sheet_name = list(sheets_desc.keys())[sheet_index]
        print(f"Sheet name from sheets_desc dict: {sheet_name}.", 
            f"Sheet name from file: {year_file.sheet_names[sheet_index]}\n",
            f"Loaded sheet description: {sheets_desc[sheet_name]}")

        if sheet_index==0:
            current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
            current_df = empty_string_remover(current_df)
            # Here we add year and Urban columns to the Data (first sheet) dataframe
            current_df["Urban"] = Urban
            current_df["Year"] = year
            # Drop rows that:
                # TakmilDescA is not being nan, these only exist in year 1401.
                # TakmilDescB or JaygozinDescA or JaygozinDescB or JaygozinDescC is not nan,
                # Jaygozin ==2,
                # However, JaygozinDescC does not appear to have a specific definition.
                # Because all the above mean the end of survey for that family.

            current_df = current_df.loc[current_df.TakmilDescA.isna() | 
                                    current_df.TakmilDescB.isna() | 
                                    current_df.JaygozinDescA.isna() | 
                                    current_df.JaygozinDescB.isna() |
                                    current_df.Jaygozin!=2, :]
            if "JaygozinDescC" in current_df.columns:
                current_df = current_df.loc[current_df.JaygozinDescC.isna(), :]

            # We only consider addresses from this dataframe and do left join with other dfs
            year_df = current_df.loc[:, ["Address", "Urban", "Year", "Fasl", "khanevartype", "province", "town"]].copy()
            del current_df

        if sheet_index==1:
            current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
            current_df = empty_string_remover(current_df)
            # Counting family members of each family
            family_member_count = (current_df.groupby("Address")
                                .agg({"member": "count"})
                                .rename(columns={"member": "member_cnt"}).reset_index())

            # Counting employed or student members of a family,
            # presumably because the go outside more and have more transportation expense
            active_member_count = (current_df.loc[current_df.occupationalst.str.lower().isin(["employed", "student"]), 
                                                    ["Address", "occupationalst"]]
                                            .groupby("Address").agg({"occupationalst": "count"})
                                            .rename(columns={"occupationalst": "active_member_cnt"})
                                            .reset_index())
            family_mean_age = (current_df.groupby("Address").agg({"age": "mean"})
                                .rename(columns={"age": "mean_age"})
                                .reset_index())



            

            # Counting members with more than Diploma education
            # Maybe these people go out more often
            highly_educated_members_count = (current_df.loc[current_df.degree
                                                            .isin(['Bachelor', 'College', 'Master', 'PhD']), 
                                                            ["Address", "degree"]]
                                            .groupby("Address").agg({"degree": "count"})
                                            .rename(columns={"degree": "highly_educated_member_cnt"})
                                            .reset_index())

            year_df = pd.merge(year_df, family_member_count, how="left", on="Address")
            year_df = pd.merge(year_df, active_member_count, how="left", on="Address")
            year_df = pd.merge(year_df, family_mean_age, how="left", on="Address")
            year_df = pd.merge(year_df, highly_educated_members_count, how="left", on="Address")
            del current_df, family_member_count, active_member_count, family_mean_age, highly_educated_members_count


        # Maybe if they have vehicles, they spend less on public transportation cost
        # However, they might spend more on personal vehicle expenses
        if sheet_index==2:
            cols = ["Address","tenure", "vehicle", "motorcycle", "bicycle"]
            current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index], usecols = cols)
            current_df = empty_string_remover(current_df)
            year_df = pd.merge(year_df, current_df, how="left", on="Address")
            del current_df

        # Transportation expenses
        if sheet_index==9:
            current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
            current_df = empty_string_remover(current_df)
            transportation_cost_df = (current_df.groupby("Address").agg({"value": "sum"})
                                .rename(columns={"value": "transportation_cost"})
                                .reset_index())
            year_df = pd.merge(year_df, transportation_cost_df, how="left", on="Address")
            del current_df, transportation_cost_df

        # Communication expenses was seen to have a modest correlation with our target
        if sheet_index==10:
            current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
            current_df = empty_string_remover(current_df)
            current_df.value = pd.to_numeric(current_df.value, errors='coerce').fillna(0).astype("int64")
            current_df = (current_df.groupby("Address").agg({"value": "sum"})
                                .rename(columns={"value": "communication_expenses"})
                                .reset_index())
            year_df = pd.merge(year_df, current_df, how="left", on="Address")
            del current_df
        # Dining out more, might require more transportation costs for families
        if sheet_index==13:
            current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
            current_df = empty_string_remover(current_df)
            current_df = (current_df.groupby("Address").agg({"value": "sum"})
                                .rename(columns={"value": "dining_expenses"})
                                .reset_index())
            year_df = pd.merge(year_df, current_df, how="left", on="Address")
            del current_df

        # بخش 13 اطلاعات مفیدی در رابطه با حمل و نقل داره مانند بیمه مرتبط با حمل و نقل یا هزینه های مربوط به خرید وسایل نقلیه
        if sheet_index==15:
            current_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
            current_df = empty_string_remover(current_df)
            current_df.value = pd.to_numeric(current_df.value, errors='coerce')
            current_df.value = current_df.value.fillna(0).astype("int64")
            # مخارج حق بیمه مرتبط با حمل و نقل خانوار
            transport_insurance_df = (current_df.loc[current_df.code.isin([125411,125412,125413]), :]
                                .groupby("Address").agg({"value":"sum"})
                                .rename(columns={"value": "transport_insurance_expenses"})
                                .reset_index())
            # هزینه های مربوط به خرید وسایل نقلیه و تجهیزات مربوطه
            vehicle_expenses_df = (current_df.loc[current_df.code.astype("str").str.match("^7[12]\d+$")]
                            .groupby("Address").agg({"value":"sum"})
                            .rename(columns={"value": "vehicle_expenses"})
                            .reset_index())
            year_df = pd.merge(year_df, transport_insurance_df, how="left", on="Address")
            year_df = pd.merge(year_df, vehicle_expenses_df, how="left", on="Address")
            del current_df, transport_insurance_df, vehicle_expenses_df

        if sheet_index==17:
            income_wage_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
            income_wage_df = (empty_string_remover(income_wage_df).groupby("Address")
                                .agg({"netincome_w_y": "sum"})
                                .reset_index())
            year_df = pd.merge(year_df, income_wage_df, how="left", on="Address")
            del income_wage_df

        if sheet_index==18:
            #Calculating azad income
            income_azad_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
            income_azad_df = empty_string_remover(income_azad_df)
            income_azad_df.income_s_y = pd.to_numeric(income_azad_df.income_s_y, errors='coerce')
            income_azad_df = income_azad_df.dropna(subset=['income_s_y'])
            income_azad_df.income_s_y = income_azad_df.income_s_y.astype("int64")
            income_azad_df = income_azad_df.groupby("Address").agg({"income_s_y": "sum"}).reset_index()
            year_df = pd.merge(year_df, income_azad_df, how="left", on="Address")
            del income_azad_df

        if sheet_index==19:
            # Calculating miscellaneous income
            income_misc_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
            income_misc_df = empty_string_remover(income_misc_df).fillna(0).astype("int64")
            income_misc_df = income_misc_df.groupby("Address").agg({"income_pension": "sum",
                                                        "income_rent": "sum",
                                                        "income_interest": "sum",
                                                        "income_aid": "sum",
                                                        "income_resale": "sum",
                                                        "income_transfer": "sum",
                                                        }).sum(axis=1).to_frame(name='misc_income').reset_index()
            year_df = pd.merge(year_df, income_misc_df, how="left", on="Address")
            del income_misc_df

        if sheet_index==20:
            #Calculatin subsidy income
            income_subsidy_df = pd.read_excel(year_file, year_file.sheet_names[sheet_index])
            income_subsidy_df = empty_string_remover(income_subsidy_df).fillna(0).astype("int64")
            income_subsidy_df = income_subsidy_df.groupby("Address").agg({"subsidy":"sum"}).reset_index()
            year_df = pd.merge(year_df, income_subsidy_df, how="left", on="Address")
            del income_subsidy_df

    dfs.append(year_df)

Sheet name from sheets_desc dict: Data. Sheet name from file: R1400Data
 Loaded sheet description: مشخصات پرسشنامه
Sheet name from sheets_desc dict: P1. Sheet name from file: R1400P1
 Loaded sheet description: قسمت یکم: خصوصیات اجتماعی اعضای خانوار
Sheet name from sheets_desc dict: P2. Sheet name from file: R1400P2
 Loaded sheet description: قسمت دوم: مشخصات محل سکونت
Sheet name from sheets_desc dict: P3S07. Sheet name from file: R1400P3S07
 Loaded sheet description: قسمت سوم: بخش ۷ هزینه‌های حمل و نقل خانوار در ماه گذشته
Sheet name from sheets_desc dict: P3S08. Sheet name from file: R1400P3S08
 Loaded sheet description: قسمت سوم: بخش ۸ هزینه‌های ارتباطات خانوار در ماه گذشته
Sheet name from sheets_desc dict: P3S11. Sheet name from file: R1400P3S11
 Loaded sheet description: قسمت سوم: بخش ۱۱ هزینه‌های غذاهای آماده، هتل و رستوران خانوار در ماه گذشته
Sheet name from sheets_desc dict: P3S13. Sheet name from file: R1400P3S13
 Loaded sheet description: قسمت سوم: بخش ۱۳ سایر هزینه‌ها و انتقال

In [8]:
len(dfs)

8

In [9]:
final_df = pd.concat(dfs, axis=0)

In [10]:
final_df

Unnamed: 0,Address,Urban,Year,Fasl,khanevartype,province,town,member_cnt,active_member_cnt,mean_age,...,bicycle,transportation_cost,communication_expenses,dining_expenses,transport_insurance_expenses,vehicle_expenses,netincome_w_y,income_s_y,misc_income,subsidy
0,20001383919,False,1400,2,1,Markazi,1,5,2.0,20.80,...,False,600000.0,140000.0,,,488400000.0,354000000.0,,158600000.0,23100000.0
1,20001383923,False,1400,2,1,Markazi,1,4,3.0,32.50,...,False,2800000.0,650000.0,,,13600000.0,,636000000.0,,
2,20001383925,False,1400,2,1,Markazi,1,4,1.0,46.50,...,False,2350000.0,600000.0,,,5600000.0,,173000000.0,42320000.0,17640000.0
3,20001383929,False,1400,2,1,Markazi,1,2,1.0,28.00,...,False,3000000.0,1000000.0,,,,160000000.0,,23560000.0,6720000.0
4,20001383932,False,1400,2,1,Markazi,1,2,,61.50,...,False,2500000.0,290000.0,,,,,,416360000.0,10920000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19301,13006383816,True,1399,2,1,Alborz,6,2,2.0,27.50,...,True,3900000.0,400000.0,,,,245973334.0,,,
19302,13006383821,True,1399,2,1,Alborz,6,5,3.0,25.20,...,False,700000.0,1050000.0,4948000.0,,,364056000.0,210000000.0,56400000.0,26950000.0
19303,13006383824,True,1399,2,1,Alborz,6,4,2.0,26.75,...,False,1600000.0,1050000.0,650000.0,12000000.0,2700000.0,380000000.0,,53760000.0,21840000.0
19304,13006383826,True,1399,2,1,Alborz,6,5,3.0,25.00,...,False,1700000.0,850000.0,,9000000.0,4000000.0,400000000.0,,30500000.0,26250000.0


In [11]:
final_df.reset_index(drop=True).to_pickle("prediction_features_df.pkl")