In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots

# import warnings
# warnings.filterwarnings('ignore')

***Common helper functions***

In [25]:
def find_values_diff(col1, col2):
    """Return the values that occur in exactly one of ``col1`` and ``col2``.

    The result lists the values found only in ``col1`` first, followed by the
    values found only in ``col2``. Each part is produced by ``np.setdiff1d``,
    so it is sorted and contains no duplicates.
    """
    ordered_pairs = ((col1, col2), (col2, col1))
    one_sided = [np.setdiff1d(left, right) for left, right in ordered_pairs]
    return np.concatenate(one_sided)

#### Начнем исследовать данные

Посмотрю на каждую таблицу и кратенько отмечу интересные моменты.

In [9]:
# Load the dataset tables; the relative paths are resolved against the
# current working directory.

# Information about the assessments within modules
assessments = pd.read_csv("assessments.csv")
# The assessment results themselves
resultAssessments = pd.read_csv("studentAssessment.csv")
# List of modules
courses = pd.read_csv("courses.csv")
# General information about each student
studentInfo = pd.read_csv("studentInfo.csv")
# Information about student registration for modules
registration = pd.read_csv("studentRegistration.csv")
# Student interaction with the virtual learning environment
studentVle = pd.read_csv("studentVle.csv")
# Materials available in the virtual learning environment
vle = pd.read_csv("vle.csv")

##### Сначала глянем assessments и resultAssessments

In [15]:
# assessments
# info() prints its report itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the cell output.
assessments.info()
assessments.sample(7)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   code_module        206 non-null    object 
 1   code_presentation  206 non-null    object 
 2   id_assessment      206 non-null    int64  
 3   assessment_type    206 non-null    object 
 4   date               195 non-null    float64
 5   weight             206 non-null    float64
dtypes: float64(2), int64(1), object(3)
memory usage: 9.8+ KB
None


Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight
177,GGG,2013J,37419,CMA,229.0,0.0
62,CCC,2014B,24290,Exam,,100.0
37,BBB,2014B,15016,CMA,82.0,1.0
66,CCC,2014J,24297,CMA,144.0,8.0
105,DDD,2014J,25365,TMA,111.0,25.0
4,AAA,2013J,1756,TMA,215.0,30.0
43,BBB,2014B,15010,TMA,82.0,18.0


In [16]:
# resultAssessments
# info() prints its report itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the cell output.
resultAssessments.info()  # No missing values - that is good
resultAssessments.sample(7)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173912 entries, 0 to 173911
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   id_assessment   173912 non-null  int64
 1   id_student      173912 non-null  int64
 2   date_submitted  173912 non-null  int64
 3   is_banked       173912 non-null  int64
dtypes: int64(4)
memory usage: 5.3 MB
None


Unnamed: 0,id_assessment,id_student,date_submitted,is_banked
4233,14984,550708,18,0
47083,24282,2488382,32,0
167857,37430,614419,147,0
172431,37440,276893,192,0
51992,24288,625108,139,0
124930,34878,610687,6,0
3617,14984,468021,19,0


In [69]:
# Assessment IDs that occur in only one of the two tables.
absent_ids = find_values_diff(
    resultAssessments["id_assessment"],
    assessments["id_assessment"],
)
# Show the assessment rows for those IDs: the missing ones are exams.
assessments.loc[assessments["id_assessment"].isin(absent_ids)]

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight
5,AAA,2013J,1757,Exam,,100.0
11,AAA,2014J,1763,Exam,,100.0
23,BBB,2013B,14990,Exam,,100.0
35,BBB,2013J,15002,Exam,,100.0
47,BBB,2014B,15014,Exam,,100.0
53,BBB,2014J,15025,Exam,,100.0
63,CCC,2014B,40087,Exam,,100.0
73,CCC,2014J,40088,Exam,,100.0
113,EEE,2013J,30713,Exam,235.0,100.0
118,EEE,2014B,30718,Exam,228.0,100.0


In [44]:
# Non-null value counts per column for every assessment type.
group = assessments.groupby("assessment_type").count()
# Keep only the exam row. Author's remark: grades nevertheless exist for some
# 6 exams. Oh well.
# NOTE(review): the recorded outputs show 24 exams here and 10 exams without
# results in the previous cell, which would leave 14 rather than 6 - confirm.
group.loc[group.index == "Exam"]

Unnamed: 0_level_0,code_module,code_presentation,id_assessment,date,weight
assessment_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Exam,24,24,24,13,24


In [74]:
# Join the two tables: attach the assessment metadata to every result row.
assesmentsFullInfo = pd.merge(
    resultAssessments,
    assessments,
    how="inner",
    on="id_assessment",
)
# The join key is shared, so the merged frame has one column fewer than the
# two inputs combined.
assert (
    len(assesmentsFullInfo.columns)
    == len(assessments.columns) + len(resultAssessments.columns) - 1
), "unexpected number of columns after the merge"
# The inner join must preserve the number of result rows.
assert assesmentsFullInfo.shape[0] == resultAssessments.shape[0], (
    "the merge changed the number of result rows"
)
# info() prints its report itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the cell output.
assesmentsFullInfo.info()
assesmentsFullInfo.sample(7)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173912 entries, 0 to 173911
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id_assessment      173912 non-null  int64  
 1   id_student         173912 non-null  int64  
 2   date_submitted     173912 non-null  int64  
 3   is_banked          173912 non-null  int64  
 4   code_module        173912 non-null  object 
 5   code_presentation  173912 non-null  object 
 6   assessment_type    173912 non-null  object 
 7   date               171047 non-null  float64
 8   weight             173912 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 11.9+ MB
None


Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,code_module,code_presentation,assessment_type,date,weight
25868,15005,1440341,133,0,BBB,2013J,CMA,131.0,1.0
133125,34886,2606636,23,0,FFF,2014B,TMA,24.0,12.5
16714,14997,527799,46,0,BBB,2013J,TMA,47.0,18.0
66321,25335,204996,52,0,DDD,2013B,TMA,53.0,10.0
146450,34901,2301830,93,0,FFF,2014J,TMA,94.0,25.0
95454,25368,285481,243,0,DDD,2014J,Exam,,100.0
143371,34899,678954,22,0,FFF,2014J,TMA,24.0,12.5


In [78]:
# Drop the `is_banked` feature: the author judged its distribution too poor
# to be useful. The counts are printed explicitly because a non-final
# expression in a cell is not displayed.
# The guard keeps the cell idempotent: once the column is gone, re-running
# the cell would otherwise raise KeyError.
if "is_banked" in assesmentsFullInfo.columns:
    print(assesmentsFullInfo["is_banked"].value_counts())
    assesmentsFullInfo = assesmentsFullInfo.drop(columns="is_banked")