# Preliminary Exploration

In [None]:
import os

import pandas as pd

In [None]:
data_path = "../../data_collection/data/processed"


def _read_processed(filename):
    """Read one CSV file from the ``data_path`` directory into a DataFrame."""
    return pd.read_csv(os.path.join(data_path, filename))


# Program tables, one per country.
pgm_india = _read_processed("india_pgms.csv")
pgm_usa = _read_processed("usa_pgms.csv")

# Curriculum tables, one per country.
curr_india = _read_processed("india_curr.csv")
curr_usa = _read_processed("usa_curr.csv")

In [None]:
def compile_summary(df):
    """Print a column-by-column overview of a DataFrame.

    Prints the number of columns and rows, followed by a table with one row
    per column giving its name, dtype, number of missing values and number of
    distinct non-missing values. Columns that hold no data at all (every
    value missing) are left out of the table.

    Parameters
    ----------
    df : pandas.DataFrame
        The table to summarise.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    n_rows, n_cols = df.shape
    print(f"Number of variables: {n_cols}")
    print(f"Number of data points: {n_rows}")

    fdf = pd.DataFrame(
        dict(
            feature_name=df.columns.to_list(),
            feature_type=df.dtypes.to_list(),
            missing_values=df.isna().sum().to_list(),
            # nunique() skips NaN, so a column with missing entries does not
            # get an extra "distinct value" added to its count.
            unique_values=df.nunique().to_list(),
        )
    )

    # get rid of columns with no data
    fdf = fdf[fdf.missing_values != n_rows]

    print(fdf)


In [None]:
# Column overview of the India programs table (loaded from india_pgms.csv).
compile_summary(pgm_india)

Number of variables: 17
Number of data points: 33
                feature_name feature_type  missing_values  unique_values
0                   uni_name       object               0             18
1                   pgm_name       object               0             23
2                    dur_yrs        int64               0              1
3                       type       object               0              2
4           national_rank_qs        int64               0             17
5              dept_involved       object               0             24
6                   location       object               0             11
7                        url       object               0             33
8                      descr       object               4             30
9                 header_tag       object              12             15
10              header_names       object              12             22
11                     links       object              12             20
1

1. 18 unique Indian universities
2. 33 unique data programs
3. 23 unique program names
4. 24 unique departments involved (not a very informative figure, since departments with different names may serve the same purpose)
5. 11 unique Indian states
6. 29 of the 33 programs have program descriptions on the website (4 are missing)
7. 11 programs use online PDF documents to detail their program as compared to the use of a program webpage
    - Naturally, these document descriptions are very elaborate

In [None]:
# Column overview of the USA programs table (loaded from usa_pgms.csv).
compile_summary(pgm_usa)

Number of variables: 17
Number of data points: 81
                feature_name feature_type  missing_values  unique_values
0                   uni_name       object               0             37
1                   pgm_name       object               0             49
2                    dur_yrs      float64              26              4
3                       type       object               0              2
4           national_rank_qs        int64               0             34
5              dept_involved       object               0             49
6                   location       object               0             32
7                        url       object               0             81
8                      descr       object               0             81
9                 header_tag       object               0             37
10              header_names       object               0             79
11                     links       object               0             81
1

1. 37 unique US universities
2. 81 unique programs
3. 49 unique program names
4. 49 different departments involved
5. All programs choose to display most program-related information on a program webpage

In [None]:
# Column overview of the India curriculum table (loaded from india_curr.csv).
compile_summary(curr_india)

Number of variables: 4
Number of data points: 389
                 feature_name feature_type  missing_values  unique_values
0                         url       object               0             33
1           compulsory course       object               3            313
2  course outcome or overview       object             251            134
3              topics covered       object             282            108


1. 3 programs have no mention of compulsory courses
2. ~250 compulsory courses have no course outcome or overview
3. ~280 compulsory courses have no information on topics covered

Fix
1. Find how many compulsory courses have neither a course outcome/overview nor topics covered

In [None]:
# Column overview of the USA curriculum table (loaded from usa_curr.csv).
compile_summary(curr_usa)

Number of variables: 6
Number of data points: 613
                 feature_name feature_type  missing_values  unique_values
0                         url       object               0             81
1           compulsory course       object               0            533
2  course outcome or overview       object             163            388
3              topics covered       object             599             15


1. ~160 compulsory courses have no course outcome or overview
2. ~600 compulsory courses have no information on topics covered

Fix
1. Talk about the distinction between course outcome and topics covered
    - Why is this distinction more prominent in Indian programs and less pronounced in programs in the USA?

In [None]:
def summarize_curriculum(df, name="country"):
    """Print headline statistics for a curriculum table.

    Reports the number of distinct program URLs, the median number of
    compulsory courses per program, and the median number of words in the
    course outcome/overview text. Courses without an outcome (NaN, or the
    "Not inferred" placeholder) contribute a word count of 0 to that median.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``url``, ``compulsory course`` and
        ``course outcome or overview``. The frame is not modified.
    name : str, optional
        Label used in the printed heading.

    Returns
    -------
    None
        The summary is written to stdout.
    """

    def _word_count(text):
        # Missing values and the "Not inferred" placeholder carry no text.
        if not isinstance(text, str) or text == "Not inferred":
            return 0
        # Split on whitespace so that words, not characters, are counted.
        return len(text.split())

    # count() skips missing course names, so only real courses are tallied.
    courses_per_program = df.groupby("url")["compulsory course"].count()
    words_per_outcome = df["course outcome or overview"].apply(_word_count)

    print(f"Summary of data program curriculums in {name}")
    print("A. Dataset Information")
    print(f"  Number of programs: {df['url'].nunique(dropna=False)}")
    print(f"  Median number of compulsory courses per program: {courses_per_program.median()}")
    print(f"  Median number of words per course outcomes: {words_per_outcome.median()}")

In [None]:
# Headline curriculum statistics for the India curriculum table.
summarize_curriculum(curr_india, "India")

Summary of data program curriculums in India
A. Dataset Information
  Number of programs: 33
  Median number of compulsory courses per program: 12.0
  Median number of words per course outcomes: 0.0


In [None]:
# Headline curriculum statistics for the USA curriculum table.
summarize_curriculum(curr_usa, "USA")

Summary of data program curriculums in USA
A. Dataset Information
  Number of programs: 81
  Median number of compulsory courses per program: 7.0
  Median number of words per course outcomes: 232.0


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=596b9f0a-2431-4aa3-878c-95287ebfbe9a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>