In [1]:
import math
import warnings

import numpy as np
import pandas as pd

In [2]:
import atoti as tt

Welcome to atoti 0.7.1!

By using this community edition, you agree with the license available at https://docs.atoti.io/latest/eula.html.
Browse the official documentation at https://docs.atoti.io.
Join the community at https://www.atoti.io/register.

atoti collects telemetry data, which is used to help understand how to improve the product.
If you don't wish to send usage data, set the ATOTI_DISABLE_TELEMETRY environment variable to True.

You can hide this message by setting the ATOTI_HIDE_EULA_MESSAGE environment variable to True.


### Load Data

In [3]:
# Read the raw course table from CSV (path is relative to the notebook's working directory).
courses_frame = pd.read_csv('data/courses.csv')

In [4]:
# Read the raw student-info table from CSV (path is relative to the notebook's working directory).
std_info_frame = pd.read_csv('data/studentInfo.csv')

In [5]:
# Read the raw assessments table from CSV (path is relative to the notebook's working directory).
assesments_frame = pd.read_csv('data/assessments.csv')

In [6]:
# Read the raw student-assessment table from CSV (path is relative to the notebook's working directory).
stdAsses_frame = pd.read_csv('data/studentAssessment.csv')

### Ambil fitur-fitur pada tabel yang akan dianalisis

#### Pilih fitur dari Tabel course

In [7]:
# Keep only the columns of courses_frame that the analysis needs.
# Selecting with .loc raises a KeyError on a misspelled column name, whereas
# pd.DataFrame(frame, columns=[...]) would silently add an all-NaN column.
# .copy() makes the result independent of the raw frame.
courses = courses_frame.loc[
    :, ['code_presentation', 'code_module', 'module_presentation_length']
].copy()

# Preview the first rows.
courses.head(5)

Unnamed: 0,code_presentation,code_module,module_presentation_length
0,2013J,AAA,268
1,2014J,AAA,269
2,2013J,BBB,268
3,2014J,BBB,262
4,2013B,BBB,240


#### Pilih fitur dari Tabel studentInfo

In [8]:
# Keep only the columns of std_info_frame that the analysis needs.
# Selecting with .loc raises a KeyError on a misspelled column name, whereas
# pd.DataFrame(frame, columns=[...]) would silently add an all-NaN column.
# NOTE(review): code_presentation is not selected here, so later joins with the
# course data can only match on code_module - confirm this is intended.
stdInfo = std_info_frame.loc[:, [
    'id_student', 'code_module', 'final_result', 'gender', 'imd_band',
    'highest_education', 'age_band', 'num_of_prev_attempts',
    'studied_credits', 'region',
]].copy()

# The id is an identifier, not a quantity, so store it as a string.
stdInfo['id_student'] = stdInfo['id_student'].astype('string')

# Preview the first rows.
stdInfo.head(5)


Unnamed: 0,id_student,code_module,final_result,gender,imd_band,highest_education,age_band,num_of_prev_attempts,studied_credits,region
0,11391,AAA,Pass,M,90-100%,HE Qualification,55<=,0,240,East Anglian Region
1,28400,AAA,Pass,F,20-30%,HE Qualification,35-55,0,60,Scotland
2,30268,AAA,Withdrawn,F,30-40%,A Level or Equivalent,35-55,0,60,North Western Region
3,31604,AAA,Pass,F,50-60%,A Level or Equivalent,35-55,0,60,South East Region
4,32885,AAA,Pass,F,50-60%,Lower Than A Level,0-35,0,60,West Midlands Region


#### Pilih fitur dari Tabel assessments

In [9]:
# Keep only the columns of assesments_frame that the analysis needs.
# Selecting with .loc raises a KeyError on a misspelled column name, whereas
# pd.DataFrame(frame, columns=[...]) would silently add an all-NaN column.
asses = assesments_frame.loc[:, ['id_assessment', 'assessment_type', 'date']].copy()

# The id is an identifier, not a quantity, so store it as a string.
asses['id_assessment'] = asses['id_assessment'].astype('string')

# Preview the first rows.
asses.head(5)

Unnamed: 0,id_assessment,assessment_type,date
0,1752,TMA,19.0
1,1753,TMA,54.0
2,1754,TMA,117.0
3,1755,TMA,166.0
4,1756,TMA,215.0


#### Pilih fitur dari Tabel studentAssessment

In [10]:
# Keep only the columns of stdAsses_frame that the analysis needs.
# Selecting with .loc raises a KeyError on a misspelled column name, whereas
# pd.DataFrame(frame, columns=[...]) would silently add an all-NaN column.
stdAsses = stdAsses_frame.loc[:, ['id_assessment', 'id_student', 'score']].copy()

# Both ids are identifiers, not quantities, so store them as strings.
stdAsses = stdAsses.astype({'id_assessment': 'string', 'id_student': 'string'})

# Preview the first rows.
stdAsses.head(5)

Unnamed: 0,id_assessment,id_student,score
0,1752,11391,78.0
1,1752,28400,70.0
2,1752,31604,72.0
3,1752,32885,69.0
4,1752,38053,79.0


### Handling Null Values

#### Null values pada tabel course

In [11]:
# Missing values per column of courses, largest count first (top five shown).
courses.isna().sum().sort_values(ascending=False).to_frame().head()

Unnamed: 0,0
code_presentation,0
code_module,0
module_presentation_length,0


tidak ditemukan null values --> aman

#### Null values pada tabel studentInfo

In [12]:
# Missing values per column of stdInfo, largest count first (top five shown).
stdInfo.isna().sum().sort_values(ascending=False).to_frame().head()

Unnamed: 0,0
imd_band,1111
id_student,0
code_module,0
final_result,0
gender,0


imd_band akan didrop karena tidak digunakan dalam analisis

In [13]:
# imd_band is the only column with missing values and is not used in the analysis, so remove it.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises a KeyError because the column is already gone.
stdInfo = stdInfo.drop(columns=['imd_band'], errors='ignore')

In [14]:
# Re-check the missing values per column of stdInfo after dropping imd_band.
stdInfo.isna().sum().sort_values(ascending=False).to_frame().head()

Unnamed: 0,0
id_student,0
code_module,0
final_result,0
gender,0
highest_education,0


#### Null values pada tabel assessments

In [15]:
# Missing values per column of asses, largest count first (top five shown).
asses.isna().sum().sort_values(ascending=False).to_frame().head()

Unnamed: 0,0
date,11
id_assessment,0
assessment_type,0


date akan didrop karena tidak digunakan dalam analisis

In [16]:
# date is the only column with missing values and is not used in the analysis, so remove it.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises a KeyError because the column is already gone.
asses = asses.drop(columns=['date'], errors='ignore')

In [17]:
# Re-check the missing values per column of asses after dropping date.
asses.isna().sum().sort_values(ascending=False).to_frame().head()

Unnamed: 0,0
id_assessment,0
assessment_type,0


#### Null values pada tabel studentAssessment

In [18]:
# Missing values per column of stdAsses, largest count first (top five shown).
stdAsses.isna().sum().sort_values(ascending=False).to_frame().head()

Unnamed: 0,0
score,173
id_assessment,0
id_student,0


Fill missing values dengan mean

In [19]:
# Impute missing scores with the mean of the observed scores
# (Series.mean skips missing values when computing the average).
stdAsses["score"] = stdAsses["score"].fillna(stdAsses["score"].mean())

Cek ulang missing values

In [20]:
# Re-check the missing values per column of stdAsses after the imputation.
stdAsses.isna().sum().sort_values(ascending=False).to_frame().head()

Unnamed: 0,0
id_assessment,0
id_student,0
score,0


Sudah tidak ada missing values

In [21]:
# Round the (possibly fractional, after mean imputation) scores to whole
# numbers and store them with an integer dtype.
stdAsses['score'] = stdAsses['score'].round().astype('int')

# Summarize the column dtypes and non-null counts.
stdAsses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173912 entries, 0 to 173911
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id_assessment  173912 non-null  string
 1   id_student     173912 non-null  string
 2   score          173912 non-null  int32 
dtypes: int32(1), string(2)
memory usage: 3.3 MB


## Create Session

In [22]:
# Start the atoti session through which all tables and cubes below are created.
# NOTE(review): the session is never closed in this notebook, and re-running this
# cell presumably starts another session - confirm whether cleanup is needed.
session = tt.Session()

#### Load data dari pandas DataFrame ke struktur in-memory atoti yang disebut "table"

In [23]:
# Load the courses frame into an atoti table named "Courses" and preview it.
# No key columns are declared for this table.
course_table = session.read_pandas(courses, table_name="Courses")
course_table.head()

Unnamed: 0,code_presentation,code_module,module_presentation_length
0,2013J,AAA,268
1,2014B,EEE,241
2,2014J,AAA,269
3,2014J,CCC,269
4,2013J,EEE,268


In [24]:
# Load the student-info frame into an atoti table named "Student Info",
# keyed on id_student.
# In a keyed atoti table, a row whose key already exists replaces the earlier
# row, so any student appearing in several rows of stdInfo would be reduced to
# a single row without notice. Report how many rows are affected before loading.
duplicated_students = int(stdInfo['id_student'].duplicated().sum())
if duplicated_students:
    warnings.warn(
        f"{duplicated_students} rows of stdInfo share an id_student with an earlier row; "
        "only one row per id_student is kept in the 'Student Info' table."
    )

stdInfo_table = session.read_pandas(stdInfo, table_name="Student Info", keys=['id_student'])
stdInfo_table.head()

Unnamed: 0_level_0,code_module,final_result,gender,highest_education,age_band,num_of_prev_attempts,studied_credits,region
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
45462,AAA,Pass,M,HE Qualification,0-35,0,60,Scotland
63400,AAA,Pass,M,Lower Than A Level,35-55,0,60,Scotland
106577,AAA,Pass,M,Lower Than A Level,0-35,0,60,East Midlands Region
118983,AAA,Pass,M,HE Qualification,0-35,0,120,East Midlands Region
137873,AAA,Pass,M,A Level or Equivalent,35-55,0,60,South West Region


In [25]:
# Load the assessments frame into an atoti table named "Assessments",
# keyed on id_assessment, and preview it.
asses_table = session.read_pandas(asses, table_name="Assessments", keys=['id_assessment'])
asses_table.head()

Unnamed: 0_level_0,assessment_type
id_assessment,Unnamed: 1_level_1
1758,TMA
14993,CMA
14989,TMA
14997,TMA
15017,CMA


In [26]:
# Load the student-assessment frame into an atoti table named "Student Assessment"
# and preview it. No key columns are declared for this table.
stdAsses_table = session.read_pandas(stdAsses, table_name="Student Assessment")
stdAsses_table.head()

Unnamed: 0,id_assessment,id_student,score
0,1752,11391,78
1,1752,100893,63
2,1752,106247,67
3,1752,127582,69
4,1752,142326,65


## Join

Lakukan join table Courses dan Student Info untuk membuat cube

In [27]:
# Link each Student Info row to the Courses table through code_module.
# NOTE(review): Courses also carries code_presentation, but it is not part of the
# mapping (stdInfo has no such column). The later query by code_module and
# code_presentation returns a single presentation per module, which suggests each
# student row resolves to only one Courses row - confirm this is intended.
stdInfo_table.join(course_table, mapping={"code_module": "code_module"})

In [28]:
# Preview the base table again; the displayed columns are the same as before the join.
stdInfo_table.head()

Unnamed: 0_level_0,code_module,final_result,gender,highest_education,age_band,num_of_prev_attempts,studied_credits,region
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
45462,AAA,Pass,M,HE Qualification,0-35,0,60,Scotland
63400,AAA,Pass,M,Lower Than A Level,35-55,0,60,Scotland
106577,AAA,Pass,M,Lower Than A Level,0-35,0,60,East Midlands Region
118983,AAA,Pass,M,HE Qualification,0-35,0,120,East Midlands Region
137873,AAA,Pass,M,A Level or Equivalent,35-55,0,60,South West Region


### Create Cube

In [29]:
# Build the first cube with the Student Info table as its base table.
# The queries below rely on measures such as "studied_credits.MEAN" being available on it.
cube1 = session.create_cube(stdInfo_table)

In [30]:
# Short aliases for cube1's hierarchies, levels and measures; the queries below
# look up levels through `l` and measures through `m`.
h = cube1.hierarchies
l = cube1.levels
m = cube1.measures

In [31]:
# Display the cube so its structure can be inspected in the cell output.
cube1

In [32]:
# Build a cube with the Student Assessment table as its base table.
# NOTE(review): cube2 is assigned again further down in the notebook (from
# stdInfo_table_2), so this name only refers to this cube until that cell runs.
cube2 = session.create_cube(stdAsses_table)

In [33]:
# Short aliases for cube2's hierarchies, levels and measures.
# NOTE(review): h2, l2 and m2 are reassigned further down in the notebook when
# cube2 is rebuilt, and they are not used in between - confirm they are needed here.
h2 = cube2.hierarchies
l2 = cube2.levels
m2 = cube2.measures

In [34]:
# Display the cube so its structure can be inspected in the cell output.
cube2

## Measures, Slice, dan Dice

Coba mencari measure berupa: Mean dari studied_credits (SKS yang diambil):

In [35]:
# Overall mean of studied_credits: no levels are requested, so the result is a single value.
cube1.query(m["studied_credits.MEAN"])

Unnamed: 0,studied_credits.MEAN
0,78.34


#### DICE untuk Mean studied_credits per Jenis final_result

Coba DICE cube untuk mencari measure berupa: Mean dari studied_credits (SKS yang diambil) tiap jenis final_result:

In [36]:
# Mean of studied_credits broken down by final_result (one row per result category).
cube1.query(m["studied_credits.MEAN"], levels=[l["final_result"]])

Unnamed: 0_level_0,studied_credits.MEAN
final_result,Unnamed: 1_level_1
Distinction,71.05
Fail,75.81
Pass,74.18
Withdrawn,89.35


Pada tabel di atas dapat dilihat bahwa student yang mengambil lebih banyak SKS, akan rentan Withdrawn (mengundurkan diri) dan Fail (gagal).

Sementara itu, terlihat bahwa student dengan predikat Distinction (Istimewa) dan Pass (Lulus), mengambil lebih sedikit SKS.

#### SLICE dan DICE untuk melihat rata-rata studied_credits student Withdrawn pada tiap code_module dan code_presentation

In [37]:
# Mean of studied_credits per (code_module, code_presentation), restricted to
# rows whose final_result is "Withdrawn".
cube1.query(
    m["studied_credits.MEAN"],
    levels=[l["code_module"], l["code_presentation"]],
    filter=l["final_result"] == "Withdrawn",
)

Unnamed: 0_level_0,Unnamed: 1_level_0,studied_credits.MEAN
code_module,code_presentation,Unnamed: 2_level_1
AAA,2014J,103.21
BBB,2013J,89.93
CCC,2014J,76.87
DDD,2013B,96.55
EEE,2013J,73.03
FFF,2014B,98.1
GGG,2013J,38.43


In [38]:
# Open an interactive atoti widget. What it displays is configured in the widget
# UI rather than in this code.
# NOTE(review): presumably the widget state is stored in the notebook's cell
# metadata - confirm it is saved with the notebook so the view is reproducible.
session.visualize()

Terlihat bahwa hanya pada student berumur 55 tahun ke atas (age_band `55<=`), student yang Fail rata-rata hanya mengambil sedikit SKS. Artinya, pada kelompok usia >= 55 tahun, faktor yang berpengaruh tidak hanya jumlah SKS yang diambil.

## Drilldown and Roll Up

In [39]:
# Open an interactive atoti widget for the drill-down / roll-up exploration.
# The displayed view is configured in the widget UI rather than in this code.
session.visualize()

In [40]:
# Open another interactive atoti widget for the drill-down / roll-up exploration.
# The displayed view is configured in the widget UI rather than in this code.
session.visualize()

# Cube ke-2

In [41]:
# Load the student-info frame into a second atoti table, "Student Info 2",
# keyed on id_student.
# In a keyed atoti table, a row whose key already exists replaces the earlier
# row, so any student appearing in several rows of stdInfo would be reduced to
# a single row without notice. Report how many rows are affected before loading.
duplicated_students = int(stdInfo['id_student'].duplicated().sum())
if duplicated_students:
    warnings.warn(
        f"{duplicated_students} rows of stdInfo share an id_student with an earlier row; "
        "only one row per id_student is kept in the 'Student Info 2' table."
    )

stdInfo_table_2 = session.read_pandas(stdInfo, table_name="Student Info 2", keys=['id_student'])
stdInfo_table_2.head(2)

Unnamed: 0_level_0,code_module,final_result,gender,highest_education,age_band,num_of_prev_attempts,studied_credits,region
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
45462,AAA,Pass,M,HE Qualification,0-35,0,60,Scotland
63400,AAA,Pass,M,Lower Than A Level,35-55,0,60,Scotland


In [42]:
# Load the assessments frame into a second atoti table, "Assessments 2",
# keyed on id_assessment, and preview two rows.
asses_table_2 = session.read_pandas(asses, table_name="Assessments 2", keys=['id_assessment'])
asses_table_2.head(2)

Unnamed: 0_level_0,assessment_type
id_assessment,Unnamed: 1_level_1
1758,TMA
14993,CMA


In [43]:
# Load the student-assessment frame into a second atoti table, "Student Assessment 2",
# and preview two rows. No key columns are declared for this table.
stdAsses_table_2 = session.read_pandas(stdAsses, table_name="Student Assessment 2")
stdAsses_table_2.head(2)

Unnamed: 0,id_assessment,id_student,score
0,1752,11391,78
1,1752,100893,63


In [44]:
# Load the courses frame into a second atoti table, "Courses 2", and preview two rows.
# No key columns are declared for this table.
course_table_2 = session.read_pandas(courses, table_name="Courses 2")
course_table_2.head(2)

Unnamed: 0,code_presentation,code_module,module_presentation_length
0,2013J,AAA,268
1,2014B,EEE,241


In [45]:
# Link each Student Assessment 2 row to its assessment through id_assessment,
# which is the key column of Assessments 2.
stdAsses_table_2.join(asses_table_2, mapping={"id_assessment": "id_assessment"})

In [46]:
# Link Student Info 2 (keyed on id_student) to Student Assessment 2 through id_student.
# NOTE(review): Student Assessment 2 has no key columns and presumably holds several
# rows per student, which would make this a one-to-many link from the base table.
# Verify against the atoti join documentation whether the join should instead start
# from the assessment table (with the cube built on that table).
stdInfo_table_2.join(stdAsses_table_2, mapping={"id_student": "id_student"})

In [47]:
# Link each Student Info 2 row to the Courses 2 table through code_module.
# NOTE(review): Courses 2 also carries code_presentation, but it is not part of the
# mapping (stdInfo has no such column), so a module offered in several presentations
# cannot be told apart by this join - confirm this is intended.
stdInfo_table_2.join(course_table_2, mapping={"code_module": "code_module"})

In [48]:
# Build a cube with Student Info 2 as its base table, after the joins on that table above.
# This rebinds cube2: the cube created earlier from stdAsses_table is no longer
# reachable through this name.
cube2 = session.create_cube(stdInfo_table_2)

In [49]:
# Short aliases for the rebuilt cube2's hierarchies, levels and measures
# (these replace the earlier h2, l2 and m2 bindings).
h2 = cube2.hierarchies
l2 = cube2.levels
m2 = cube2.measures

In [50]:
# Display the rebuilt cube so its structure can be inspected in the cell output.
cube2

In [51]:
# Open an interactive atoti widget to explore the session's cubes.
# The displayed view is configured in the widget UI rather than in this code.
session.visualize()