**Data Generation File**

---

\

In [8]:
# Mount google drive to load the files
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Import library 
import numpy as np
from numpy import unique  # can also just use np.unique
import pandas as pd
import random

**Loading Files**

In [10]:
# Load practice exam consolidated file
pd.set_option('display.max_columns',None)
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Capstone/Filtered_practice_attempt_log.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Org Defined ID,Username,Attempt,Attempt Start,Attempt End,Q No,Q Type,Bonus?,Marks,Total,Year,Topic,Question Type,Chapter,id,User,Time
0,0,11111111,22222222,1,04/05/20 12:10,04/05/20 12:55,1,MC,CORR : THEORY : Correlation symmetry,0.0,2,2020,CORR,THEORY,Correlation symmetry,19,Hmpivxct Sqcqnoqy,1
1,1,11111111,22222222,1,04/05/20 12:10,04/05/20 12:55,2,FIB,CLASS1 : CALC : Confusion Table,0.0,4,2020,CLASS1,CALC,Confusion Table,2,Hmpivxct Sqcqnoqy,7
2,2,11111111,22222222,1,04/05/20 12:10,04/05/20 12:55,3,MC,INTRO : THEORY : Which is not analytics method,2.0,2,2020,INTRO,THEORY,Which is not analytics method,28,Hmpivxct Sqcqnoqy,1
3,3,11111111,22222222,1,04/05/20 12:10,04/05/20 12:55,4,MC,LPSA : THEORY : Shadow price non-binding const...,0.0,2,2020,LPSA,THEORY,Shadow price non-binding constraint,41,Hmpivxct Sqcqnoqy,2
4,4,11111111,22222222,1,04/05/20 12:10,04/05/20 12:55,5,SA,MLR : CALC : Jewelry sales from window and value,0.0,4,2020,MLR,CALC,Jewelry sales from window and value,60,Hmpivxct Sqcqnoqy,2


In [11]:
# Load final exam consolidated file
pd.set_option('display.max_columns',None)
final = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Capstone/Merged_Final_Exam.csv')
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36390 entries, 0 to 36389
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      36390 non-null  int64  
 1   Org Defined ID  36390 non-null  int64  
 2   Username        36390 non-null  int64  
 3   Attempt         36390 non-null  int64  
 4   Attempt Start   36390 non-null  object 
 5   Attempt End     36390 non-null  object 
 6   Q No            36390 non-null  int64  
 7   Q Type          36390 non-null  object 
 8   Bonus?          36390 non-null  object 
 9   Marks           36390 non-null  float64
 10  Total           36390 non-null  float64
 11  Year            36390 non-null  int64  
 12  Topic           36390 non-null  object 
 13  Question Type   36390 non-null  object 
 14  Chapter         36390 non-null  object 
 15  id              36390 non-null  int64  
 16  User            36390 non-null  object 
 17  Time            36390 non-null 

**Time Duration Calculation**

In [13]:
# Time estimation for practice exam attempts

# Converting time into timestamp
df['Attempt Start'] = pd.to_datetime(df['Attempt Start'])
df['Attempt End'] = pd.to_datetime(df['Attempt End'])
# Taking difference between start and end of question
df['Total Time'] = df['Attempt End'] - df['Attempt Start']
# Converting time into total minutes
df['Total Time'] = [int(d.seconds / 60) for d in df['Total Time']]

In [15]:
# Time estimation for final exam attempt

# Converting time into timestamp
final['Attempt Start'] = pd.to_datetime(final['Attempt Start'])
final['Attempt End'] = pd.to_datetime(final['Attempt End'])
# Taking difference between start and end of question
final['Total Time'] = final['Attempt End'] - final['Attempt Start']
# Converting time into total minutes
final['Total Time'] = [int(d.seconds / 60) for d in final['Total Time']]

**Random selection of attempt for each student**

In [17]:
# Creating dataframe containing list of attempts per user 
df1 = df.groupby(['User','Year'])['Attempt'].unique().apply(list).reset_index()

# Creating a column 'Random' to which a random attempt would be allocated 
df1['Random'] = 1
df1

Unnamed: 0,User,Year,Attempt,Random
0,Aafmtore Ewidypgt,2020,"[1, 2, 3, 4, 5]",1
1,Aanicqpm Bkcjtfpi,2020,"[1, 3, 4, 5]",1
2,Abdbgfpp Wslxskbf,2022,"[2, 3, 4, 5]",1
3,Abiuxkue Linpnkla,2021,[3],1
4,Acferjam Ftqdojvl,2021,"[1, 3]",1
...,...,...,...,...
781,Zvqilppc Pscymyfg,2022,[1],1
782,Zwawscol Iddzxrgo,2020,"[1, 2, 3, 4]",1
783,Zwobkmoc Uencdjin,2020,"[1, 2]",1
784,Zwwnffgc Uahjkvbn,2021,"[1, 2]",1


**Creation of training set by extrapolating datasets (random cohorts) through practice attempts**

In [18]:
# Creating empty extrapolated dataset
train = pd.DataFrame(columns=['Students','Questions','Exam Duration','Total Marks',
                            'Average Time Taken','Difficulty','Mean','STD'])

# Creating a list of years from which the cohorts are taken from
year = list(df1['Year'].unique())

for y in year:

  # Filtering the data on year to separate out each cohort to then further extrapolate datasets
  randlist = df1[df1['Year'] == y]
  
  for i in range(30):
    # Choosing the random attempt for each student
    randlist['Random'] = [random.choice(d) for d in randlist['Attempt']]

    # Filtering the random attempts on the original dataset
    df2 = randlist.merge(df, left_on=["User","Random"], 
                         right_on = ["User","Attempt"], how='left', indicator=True)
    
    # Calculation of Difficulty Index
    dif = df2.groupby(['id']).agg({'Marks' : 'sum',
                                   'Total' : 'sum'}).sort_values('Marks').reset_index()

    dif['Difficulty'] = dif['Marks']/dif['Total']

    # Calculation of Mean and Standart Deviation
    df3 = df2.groupby(['User']).agg({'Marks' : 'sum',
                                     'Total' : 'sum',
                                     'Total Time' : 'unique'}).sort_values('Marks').reset_index()
    df3['Total'] = df3['Total'].max()
    df3['Marks'] = df3['Marks']/df3['Total']*100
    mu = df3['Marks'].describe()[1]
    sigma = df3['Marks'].describe()[2]
   
    # Appending the results of the new dataset into our training data
    train = train.append({'Students' : int(len(pd.unique(randlist['User']))),
                    'Questions' : int(len(pd.unique(df2['id']))),
                    'Exam Duration' : 120,'Total Marks' : 100,
                    'Average Time Taken' : df3['Total Time'].mean(),
                    'Difficulty' : dif['Difficulty'].mean(),'Mean' : mu,'STD' : sigma}, ignore_index=True)
    
    # Converting array element to integer
    train['Average Time Taken'] = [int(d) for d in train['Average Time Taken']]

train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Unnamed: 0,Students,Questions,Exam Duration,Total Marks,Average Time Taken,Difficulty,Mean,STD
0,289,40,120,100,49,0.732050,69.086505,20.512093
1,289,40,120,100,51,0.736397,69.740484,20.633335
2,289,40,120,100,50,0.743361,70.311419,19.639810
3,289,40,120,100,51,0.741522,69.948097,19.644781
4,289,40,120,100,51,0.741977,70.176471,19.827205
...,...,...,...,...,...,...,...,...
85,257,35,120,100,59,0.632186,59.498171,21.193184
86,257,35,120,100,61,0.635504,59.730389,20.138129
87,257,35,120,100,60,0.638969,59.770661,19.754709
88,257,35,120,100,63,0.628259,59.098755,20.844968


In [16]:
# Extracting the extrapolated to a csv file
train.to_csv('extrapolated_dataset.csv')

**Creation of test set by creating data points from final exam results of 3 cohorts.**

In [14]:
# Creating empty extrapolated dataset
test = pd.DataFrame(columns=['Students','Questions','Exam Duration','Total Marks',
                            'Average Time Taken','Difficulty','Mean','STD'])
for y in year:

  # Filtering the data on year to separate out each cohort
  randlist = final[final['Year'] == y]

  # Calculation of Difficulty Index
  dif = randlist.groupby(['id']).agg({'Marks' : 'sum',
                                      'Total' : 'sum',}).sort_values('Marks').reset_index()
  dif['Difficulty'] = dif['Marks']/dif['Total']

  # Calculation of Mean and Standart Deviation
  df3 = randlist.groupby(['User']).agg({'Marks' : 'sum',
                                        'Total' : 'sum',
                                        'Total Time' : 'unique'}).sort_values('Marks').reset_index()
  df3['Total'] = df3['Total'].max()
  df3['Marks'] = df3['Marks']/df3['Total']*100
  mu = df3['Marks'].describe()[1]
  sigma = df3['Marks'].describe()[2]

  # Appending the results of the new dataset into our training data
  test = test.append({'Students' : int(len(pd.unique(randlist['User']))),
                    'Questions' : int(len(pd.unique(randlist['id']))),
                    'Exam Duration' : 120,'Total Marks' : 100,
                    'Average Time Taken' : df3['Total Time'].mean(),
                    'Difficulty' : dif['Difficulty'].mean(),'Mean' : mu,'STD' : sigma}, ignore_index=True)

  # Converting array element to integer
  test['Average Time Taken'] = [int(d) for d in test['Average Time Taken']]

test

Unnamed: 0,Students,Questions,Exam Duration,Total Marks,Average Time Taken,Difficulty,Mean,STD
0,324,35,120,100,109,0.579256,56.624599,18.209648
1,328,37,120,100,109,0.550915,53.207138,15.63238
2,371,35,120,100,112,0.614567,55.533666,16.420352


In [15]:
# Extracting the extrapolated to a csv file
test.to_csv('test_dataset.csv')