### Load Libraries

In [99]:
# Load dependencies
import io
import json
import os

import numpy as np
import pandas as pd

### Read the CSV file listing all users

In [2]:
# Load the install log; the column names are supplied explicitly through `names`
df_all_users = (
    pd.read_csv(
        'data/install_log.csv',
        names=['nickname', 'DateStart', 'GamificationStart'],
    )
    # A nickname can be listed two or three times; keep only its last row
    .drop_duplicates(subset='nickname', keep='last')
)
df_all_users

Unnamed: 0,nickname,DateStart,GamificationStart
1,TheArk,2017-04-11,0
2,Federico,2017-04-11,1
3,evakuruczleki,2017-04-11,0
4,Shan,2017-04-11,1
5,Giada,2017-04-11,0
7,myrto,2017-04-11,0
8,ARDecre,2017-04-11,1
10,Shani,2017-04-11,1
11,MarioStefano,2017-04-11,0
12,Ricky,2017-04-12,1


### Read the JSON files from directory

In [3]:
# List the data directory and remember the name of every JSON dump found in it
path_to_json_files = 'data/'
json_files = [
    entry_name
    for entry_name in os.listdir(path_to_json_files)
    if entry_name.endswith('.json')
]

In [4]:
json_files

['Phrasebook_Dump_aggelos_NOGamification_2017_04_25.json',
 'Phrasebook_Dump_ARDecre_YESGamification_2017_04_23.json',
 'Phrasebook_Dump_Bill Bilalis_NOGamification_2017_04_22.json',
 'Phrasebook_Dump_FedeCornalba_NOGamification_2017_04_11.json',
 'Phrasebook_Dump_FedeCornalba_YESGamification_2017_04_17.json',
 'Phrasebook_Dump_Federico_YESGamification_2017_04_26.json',
 'Phrasebook_Dump_Giada_NOGamification_2017_04_22.json',
 'Phrasebook_Dump_MarioStefano_NOGamification_2017_04_23.json',
 'Phrasebook_Dump_myrto_NOGamification_2017_04_22.json',
 'Phrasebook_Dump_TheArk_NOGamification_2017_04_21.json']

We keep the filenames in a list so that they can be looked up later when loading each user's data.

### Useful functions

In [107]:
def getEndDate(s):
    """Return the date encoded at the end of a dump filename as 'YYYY-MM-DD'.

    The filename is expected to look like
    'Phrasebook_Dump_<nickname>_<YES|NO>Gamification_<YYYY>_<MM>_<DD>.json'.
    The three date fields are read from the right-hand side of the name, so a
    nickname that itself contains underscores does not shift them.
    """
    parts = s.rsplit('_', 3)
    year, month = parts[-3], parts[-2]
    # The last field still carries the file extension (e.g. '25.json')
    day = os.path.splitext(parts[-1])[0]
    return '-'.join((year, month, day))

In [5]:
def getGamification(s):
    """Return the 'YES' / 'NO' gamification flag encoded in a dump filename.

    The filename is expected to look like
    'Phrasebook_Dump_<nickname>_<YES|NO>Gamification_<YYYY>_<MM>_<DD>.json'.
    The flag field is located by counting from the right-hand side of the name,
    so a nickname that itself contains underscores does not shift it.
    """
    # Fields from the right: ..., '<FLAG>Gamification', year, month, 'day.json'
    version_field = s.rsplit('_', 4)[-4]
    # Drop the trailing 'Gamification' word and keep only the flag in front of it
    return version_field[:-len('Gamification')]

In [6]:
def getNickname(s):
    """Return the nickname encoded in a dump filename.

    The filename is expected to look like
    'Phrasebook_Dump_<nickname>_<YES|NO>Gamification_<YYYY>_<MM>_<DD>.json'.
    The four trailing fields (flag, year, month, day) are removed from the
    right and the two leading fields from the left, so the whole nickname is
    returned even when it contains underscores.
    """
    prefix_and_nickname = s.rsplit('_', 4)[0]
    # Skip the two leading 'Phrasebook' and 'Dump' fields
    return prefix_and_nickname.split('_', 2)[2]

In [7]:
def getDate(s):
    """Return the text before the first space of a datetime string.

    For a value such as '2017-04-11 10:14:54' this is the date '2017-04-11'.
    A string without any space is returned unchanged.
    """
    date_part, _, _ = s.partition(' ')
    return date_part

In [8]:
def getDuration(created, closed):
    """Return how long a user was active: `closed` minus `created`.

    The result is whatever the subtraction of the two arguments yields
    (a timedelta for datetime-like values).
    """
    elapsed = closed - created
    return elapsed

In [9]:
def getHour(s):
    """Return the hour, as an int, of a 'YYYY-MM-DD HH:MM:SS' datetime string."""
    # The time is the second space-separated field; the hour precedes its first ':'
    time_part = s.split(' ')[1]
    hour_text = time_part.partition(':')[0]
    return int(hour_text)

### Read each file separately

In [109]:
def loadUserInfo(nick,flagger):
    """Load one user's dump and return its tables as pandas DataFrames.

    `json_files` is scanned in order and the first file whose nickname and
    gamification flag (both parsed from the filename) match the arguments
    is loaded.

    Parameters
    ----------
    nick : str
        Nickname as it appears in the dump filename.
    flagger : str
        'YES' or 'NO', the gamification flag in the dump filename.

    Returns
    -------
    tuple
        (df_user, df_phrasebook, df_challenges, df_badges)

    Raises
    ------
    ValueError
        If no file in `json_files` matches `nick` and `flagger`.
    """
    for filename in json_files:
        if getNickname(filename) != nick or getGamification(filename) != flagger:
            continue

        # Read the dump explicitly as UTF-8 so that non-ASCII phrases do not
        # depend on the platform's default encoding
        dump_path = os.path.join(path_to_json_files, filename)
        with io.open(dump_path, encoding='utf-8') as json_data:
            d = json.load(json_data)

        # Badges table: only badges that carry a 'createdOn' entry are kept
        # (presumably these are the unlocked ones)
        unlocked_badges = [(badge['badgeName'], badge['createdOn'])
                           for badge in d['badges'] if 'createdOn' in badge]
        df_badges = pd.DataFrame(unlocked_badges, columns=['Name', 'Date'])

        # Phrasebook and challenges tables
        df_phrasebook = pd.DataFrame(d['phrasebook'])
        df_challenges = pd.DataFrame(d['challenges'])

        # User table
        df_user = pd.DataFrame(d['user'])
        # NOTE(review): the names below are assigned by position, so they rely on
        # the column order produced by pd.DataFrame(d['user']) -- confirm that the
        # order still matches the keys of the dump.
        df_user.columns = ['GamificationEnd', 'DateEnd', 'foreignLanguage', 'hasSwitchedVersion', 'level', 'motherLanguage', 'nickname', 'totalXP']
        # Add the install date and the starting version from the install log
        df_user = pd.merge(df_user, df_all_users, on='nickname', how='inner')
        # The end date is taken from the filename and replaces the value of the dump
        df_user['DateEnd'] = pd.to_datetime(getEndDate(filename))
        df_user['DateStart'] = pd.to_datetime(df_user['DateStart'])
        # Column-wise subtraction: DateEnd - DateStart for every row
        df_user['daysUsed'] = getDuration(df_user['DateStart'], df_user['DateEnd'])
        # Rearrange user dataframe
        df_user = df_user[['nickname','motherLanguage','foreignLanguage','GamificationStart','GamificationEnd','hasSwitchedVersion', 'DateStart', 'DateEnd', 'daysUsed', 'totalXP', 'level']]

        return df_user, df_phrasebook, df_challenges, df_badges

    # Fail loudly instead of returning None, which callers would then try to index
    raise ValueError('No dump file found for user %s with gamification flag %s' % (nick, flagger))

So when we want to reach a user's info, phrasebook, challenges table or badges table, we index the tuple returned by `loadUserInfo` accordingly:
   * **user info**: loadUserInfo(nickname, flagger)[0]
   * **phrasebook**: loadUserInfo(nickname, flagger)[1]
   * **challenges**: loadUserInfo(nickname, flagger)[2]
   * **badges**: loadUserInfo(nickname, flagger)[3]

### Check data for each user (examples)

In [110]:
# If we want the user info for ARDecre WITH gamification we write 
loadUserInfo('ARDecre', 'YES')[0]

Unnamed: 0,nickname,motherLanguage,foreignLanguage,GamificationStart,GamificationEnd,hasSwitchedVersion,DateStart,DateEnd,daysUsed,totalXP,level
0,ARDecre,ITALIANO,INGLESE,1,1,0,2017-04-11,2017-04-23,12 days,10,0


In [111]:
# If we want the phrasebook of Federico WITH gamification we write
loadUserInfo('Federico', 'YES')[1]

Unnamed: 0,archived,correctCount,createdOn,foreignLangString,id,motherLangString
0,0,0,2017-04-11 10:14:54,自転車,1,bicicletta
1,0,1,2017-04-11 10:15:24,起こす,2,svegliare qualcuno
2,0,-1,2017-04-11 10:16:39,授業中（ちゅう）,3,durante la lezione


In [112]:
# If we want the challenges of MarioStefano WITHOUT gamification we write
loadUserInfo('MarioStefano','NO')[2]

Unnamed: 0,correct,createdOn,id,phraseId
0,1,2017-04-18 15:49:14,1,2
1,0,2017-04-18 15:49:28,2,4
2,1,2017-04-18 15:49:35,3,2
3,0,2017-04-18 15:49:47,4,4
4,0,2017-04-18 17:47:35,5,5


In [91]:
# If we want the badges of FedeCornalba WITH gamification we write
loadUserInfo('FedeCornalba', 'YES')[3]

Unnamed: 0,Name,Date
0,Doing Good,2017-04-11 01:26:59
1,Keep Going,2017-04-11 01:25:04
2,Night Owl,2017-04-12 23:31:45
3,No Sleep,2017-04-11 01:23:17
4,Extreme Stamina,2017-04-11 01:26:10


### Start analysing

We can analyze the frequency of activity, both for adding words and for practicing, while clearly distinguishing between users with and without gamification.    
Users with good data are:
* TheArk
* MarioStefano
* FedeCornalba
* GiadaConfortola

We will start with FedeCornalba, as he is the only user in our dataset with complete data.

In [159]:
def _phrases_per_day(df_phrases):
    """Return the average number of phrasebook rows per day for `df_phrases`.

    The day count is the number of whole days between the dates of the first
    and the last row (rows are assumed to be in chronological order -- TODO
    confirm). An empty table gives 0.0; rows that all fall on the same day
    are counted as one day so that the division is always defined.
    """
    if df_phrases.empty:
        return 0.0

    # Keep only the date part of each 'createdOn' timestamp
    dates = pd.to_datetime(df_phrases['createdOn'].apply(getDate))
    span_days = (dates.iloc[-1] - dates.iloc[0]).days
    if span_days == 0:
        span_days = 1
    return len(dates) / float(span_days)


def avg_input(nick, flagger):
    """Describe the average number of phrases a user added per day.

    Parameters
    ----------
    nick : str
        Nickname of the user.
    flagger : str
        'YES' or 'NO': the app version (with / without gamification) to report on.

    Returns
    -------
    str or None
        A sentence with the average rounded to two decimals, or None when the
        user's 'GamificationStart' value is neither 0 nor 1.

    If `flagger` is the version the user started with, every phrase of that
    dump is counted. Otherwise only the phrases that are not already present
    in the dump of the starting version (i.e. added after the switch) are
    counted.
    """
    gamification = 'WITHOUT' if flagger == 'NO' else 'WITH'

    # Load the requested dump once and reuse its tables
    df_user, df_phrasebook = loadUserInfo(nick, flagger)[:2]

    # Translate the install-log value into the flag used in the filenames
    start_flag = {0: 'NO', 1: 'YES'}.get(df_user['GamificationStart'].values[0])
    if start_flag is None:
        return None

    if flagger == start_flag:
        df_counted = df_phrasebook
    else:
        # The baseline is the dump of the version the user started with,
        # whichever of the two it is
        df_baseline = loadUserInfo(nick, start_flag)[1]
        added_after_switch = ~df_phrasebook.index.isin(df_baseline.index)
        df_counted = df_phrasebook.loc[added_after_switch]

    average = _phrases_per_day(df_counted)
    return 'The average phrase input per day of user %s %s gamification is %.2f' % (nick, gamification, average)

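The above function checks whether the requested version (with or without gamification) is the one the user started with. If it is, the average is computed over the whole phrasebook of that dump; otherwise only the phrases added after the switch are taken into account.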

In [160]:
avg_input('FedeCornalba', 'NO')

'The average phrase input per day of user FedeCornalba WITHOUT gamification is 1.29'

In [161]:
avg_input('FedeCornalba', 'YES')

'The average phrase input per day of user FedeCornalba WITH gamification is 0.60'

## Discussion

This is just an initial version of the analysis. I will investigate further and create plots at a later stage, but first I need to know whether I am on the right track. Please add any comments on improvements and further analysis here. As you can see, the tables are a bit complicated to analyze, as we don't have enough data for a summary across all users. Play around by checking all the JSON filenames.