### Load Libraries

In [147]:
# Load dependencies
import pandas as pd
import numpy as np
import json
import os


### Read the install log (CSV) with all users

In [128]:
# Load the install log. Column names are supplied explicitly, so the first line
# of the file is read as data. Only the last logged installation per nickname is
# kept, which removes duplicate (or triplicate) entries.
df_all_users = pd.read_csv(
    'data/install_log.csv',
    names=['nickname', 'DateStart', 'GamificationStart'],
).drop_duplicates(subset='nickname', keep='last')
df_all_users

Unnamed: 0,nickname,DateStart,GamificationStart
1,TheArk,2017-04-11,0
2,Federico,2017-04-11,1
3,evakuruczleki,2017-04-11,0
4,Shan,2017-04-11,1
5,Giada,2017-04-11,0
7,myrto,2017-04-11,0
8,ARDecre,2017-04-11,1
10,Shani,2017-04-11,1
11,MarioStefano,2017-04-11,0
12,Ricky,2017-04-12,1


### Read the JSON files from directory

In [2]:
# Collect the names of all JSON dump files in the data directory.
# os.listdir returns entries in arbitrary order, so the names are sorted
# case-insensitively to get the same listing on every run and platform.
path_to_json_files = 'data/'
json_files = sorted(
    (file_name for file_name in os.listdir(path_to_json_files) if file_name.endswith('.json')),
    key=lambda file_name: file_name.lower(),
)

In [3]:
# Show the '.json' file names collected from the data directory
json_files

['Phrasebook_Dump_aggelos_NOGamification_2017_04_25.json',
 'Phrasebook_Dump_ARDecre_YESGamification_2017_04_23.json',
 'Phrasebook_Dump_Bill Bilalis_NOGamification_2017_04_22.json',
 'Phrasebook_Dump_FedeCornalba_NOGamification_2017_04_11.json',
 'Phrasebook_Dump_FedeCornalba_YESGamification_2017_04_17.json',
 'Phrasebook_Dump_Federico_YESGamification_2017_04_26.json',
 'Phrasebook_Dump_Giada_NOGamification_2017_04_22.json',
 'Phrasebook_Dump_MarioStefano_NOGamification_2017_04_23.json',
 'Phrasebook_Dump_myrto_NOGamification_2017_04_22.json',
 'Phrasebook_Dump_TheArk_NOGamification_2017_04_21.json']

Keep these file names as a reference: one of them is pasted into the file-loading cell further below.

### Useful functions

In [92]:
# Extract the date part of a 'date time' string
def getDate(s):
    """Return the text of ``s`` that precedes its first space.

    For a timestamp such as '2017-03-22 16:05:26' this is the date part,
    '2017-03-22'. A string without any space is returned unchanged.
    """
    date_part, _, _ = s.partition(' ')
    return date_part

In [96]:
# Time elapsed between the moment of creation and the moment of closing
def getDuration(created, closed):
    """Return ``closed - created``.

    The arguments only need to support subtraction. For the pandas datetimes
    used in this notebook the result is a timedelta (displayed as e.g. '20 days').
    """
    elapsed = closed - created
    return elapsed

In [114]:
# Extract the hour of a 'date time' string as an integer
def getHour(s):
    """Return the hour of a timestamp such as '2017-03-22 16:05:26' as an int.

    The hour is read from the second space-separated field, up to its first
    colon. Raises IndexError when ``s`` contains no space and ValueError when
    the hour text is not an integer.
    """
    time_part = s.split(' ')[1]
    hour_text = time_part.partition(':')[0]
    return int(hour_text)

### Read each file separately

To analyse a different user, replace the file name in the path below with one of the file names listed above.

In [187]:
# Parse the selected user dump. Only the JSON parsing needs the open file, so
# the dataframes are built after the `with` block has closed it.
with open('data/Phrasebook_Dump_FedeCornalba_YESGamification_2017_04_17.json') as json_data:
    d = json.load(json_data)

# Create the unlocked badges dataframe: only badge entries that carry a
# 'createdOn' key are kept
unlocked_badges = [(badge['badgeName'], badge['createdOn']) for badge in d['badges'] if 'createdOn' in badge]
df_badges = pd.DataFrame(unlocked_badges, columns=['Name', 'Date'])

# Load phrasebook data into a pandas dataframe
df_phrasebook = pd.DataFrame(d['phrasebook'])

# Load challenges data into a pandas dataframe
df_challenges = pd.DataFrame(d['challenges'])

# Load user data into a pandas dataframe.
# The rename below is positional, so the incoming column order matters. Older
# pandas sorted the JSON keys alphabetically, newer versions keep the order in
# which the keys appear in the file; sorting explicitly makes the rename
# deterministic either way.
# NOTE(review): the new names are assumed to follow the alphabetical order of
# the raw keys -- confirm the first two against a dump file.
df_user = pd.DataFrame(d['user']).sort_index(axis=1)
df_user.columns = ['GamificationEnd', 'DateEnd', 'foreignLanguage', 'hasSwitchedVersion', 'level', 'motherLanguage', 'nickname', 'totalXP']
# Merge with the install_log file (inner join: a nickname that is missing from
# the install log leaves df_user without rows)
df_user = pd.merge(df_user, df_all_users, on='nickname', how='inner')
# Keep only the date part of the end timestamp and convert both dates to datetimes
df_user['DateEnd'] = pd.to_datetime(df_user['DateEnd'].apply(getDate))
df_user['DateStart'] = pd.to_datetime(df_user['DateStart'])
# Column-wise subtraction (DateEnd - DateStart); unlike a row-wise apply this
# also works when the merge produced no rows
df_user['daysUsed'] = getDuration(df_user['DateStart'], df_user['DateEnd'])
# Rearrange user dataframe
df_user = df_user[['nickname','motherLanguage','foreignLanguage','GamificationStart','GamificationEnd','hasSwitchedVersion', 'DateStart', 'DateEnd', 'daysUsed', 'totalXP', 'level']]

### Check data for each user

In [188]:
# User record of the loaded dump, merged with the matching install-log entry
df_user

Unnamed: 0,nickname,motherLanguage,foreignLanguage,GamificationStart,GamificationEnd,hasSwitchedVersion,DateStart,DateEnd,daysUsed,totalXP,level
0,FedeCornalba,GERMAN,RUSSIAN,1,1,1,2017-03-22,2017-04-11,20 days,420,4


In [189]:
# Contents of the 'phrasebook' section of the loaded dump
df_phrasebook

Unnamed: 0,archived,correctCount,createdOn,foreignLangString,id,motherLangString
0,1,9,2017-03-22 16:05:26,жизнь,1,das leben
1,1,8,2017-03-22 16:08:03,имя,2,der name
2,1,3,2017-03-22 16:14:27,любовь,3,die liebe
3,1,7,2017-03-22 21:51:14,язык,4,die sprache
4,1,3,2017-03-23 22:54:18,пиво,5,das bier
5,0,0,2017-03-23 23:21:46,друг,6,der freund
6,1,7,2017-03-23 23:22:08,подруга,7,die freundin
7,1,5,2017-03-23 23:22:38,телефон,8,das handy
8,0,1,2017-03-23 23:24:12,окно,9,das fenster
9,1,7,2017-03-23 23:25:01,очки,10,die brille


In [190]:
# Contents of the 'challenges' section of the loaded dump
df_challenges

Unnamed: 0,correct,createdOn,id,phraseId
0,1,2017-03-22 16:08:14,1,1
1,1,2017-03-22 16:08:21,2,2
2,1,2017-03-22 21:51:32,3,1
3,1,2017-03-22 21:51:45,4,2
4,1,2017-04-08 13:20:49,5,1
5,1,2017-04-08 13:20:58,6,4
6,0,2017-04-08 13:21:16,7,18
7,1,2017-04-08 13:39:12,8,10
8,1,2017-04-08 13:39:21,9,9
9,1,2017-04-08 13:39:30,10,8


In [191]:
# Badges of the loaded dump that have a 'createdOn' value (shown in the Date column)
df_badges

Unnamed: 0,Name,Date
0,Doing Good,2017-04-11 01:26:59
1,Keep Going,2017-04-11 01:25:04
2,Night Owl,2017-04-12 23:31:45
3,No Sleep,2017-04-11 01:23:17
4,Extreme Stamina,2017-04-11 01:26:10


### Start analysing

We can analyze the frequency of activity, both for adding words and for practicing, while clearly distinguishing between users with and without gamification.    
Users with good data are:
* TheArk
* MarioStefano
* FedeCornalba
* GiadaConfortola

We will start with FedeCornalba, as he is the only complete user in our dataset (the only one with both a NOGamification and a YESGamification dump).

In [193]:
def avg_input(gamification, phrasebook=None, user=None):
    """Describe the average number of phrases a user added per day.

    Parameters
    ----------
    gamification : str
        'NO' words the result as WITHOUT gamification; any other value words
        it as WITH gamification. It only affects the wording, not the numbers.
    phrasebook : pandas.DataFrame, optional
        Frame with a 'createdOn' column of 'date time' strings (such as
        '2017-03-22 16:05:26'). Defaults to the notebook-level ``df_phrasebook``.
        A 'Date' column is added to (or overwritten on) this frame.
    user : pandas.DataFrame, optional
        Frame whose first 'nickname' value names the user. Defaults to the
        notebook-level ``df_user``.

    Returns
    -------
    str
        A sentence stating the number of phrases divided by the number of days
        between the earliest and the latest input. A span shorter than one day
        is counted as one day, so a phrasebook filled on a single day does not
        cause a division by zero.

    Raises
    ------
    ValueError
        If the phrasebook has no rows.
    """
    label = 'WITHOUT' if gamification == 'NO' else 'WITH'

    # Fall back to the frames loaded by the notebook when none are passed in
    if phrasebook is None:
        phrasebook = df_phrasebook
    if user is None:
        user = df_user

    if len(phrasebook) == 0:
        raise ValueError('Cannot compute the average input: the phrasebook is empty')

    # Split the date into a separate column (text before the first space)
    phrasebook['Date'] = pd.to_datetime(phrasebook['createdOn'].str.split(' ').str[0])

    # Whole days between the earliest and the latest input. max/min is used
    # rather than the first/last row so that unsorted data gives the same answer.
    span_days = (phrasebook['Date'].max() - phrasebook['Date'].min()).days
    span_days = max(span_days, 1)

    # float() keeps this a true division on Python 2 as well
    phrases_per_day = len(phrasebook) / float(span_days)
    return 'The average phrase input per day of user %s %s gamification is %.2f' % (user['nickname'].iloc[0], label, phrases_per_day)

In [186]:
# The argument only selects the WITHOUT/WITH wording; the numbers come from
# whichever dump is currently loaded, so a NOGamification file must be loaded first
avg_input('NO')

u'The average phrase input per day of user FedeCornalba WITHOUT gamification is 1.29'

Now, if we go back up, change the input file to the one with gamification, re-run the loading cell, and calculate the same average, we get the following:

In [194]:
# The argument only selects the WITHOUT/WITH wording; the numbers come from
# whichever dump is currently loaded, so a YESGamification file must be loaded first
avg_input('YES')

u'The average phrase input per day of user FedeCornalba WITH gamification is 0.96'

## Discussion

This is just an initial version of the analysis. I will investigate further and create plots at a later stage, but first I need to know whether I am on the right track. Please add any comments on improvements and further analysis here. As you can see, the tables are somewhat difficult to analyze because we don't have enough data to build a summary across all users. Feel free to play around by loading each of the JSON files listed above.