In [2]:
import os
import re
import glob
import pandas as pd
import scipy

In [3]:
os.chdir('/Users/qiongli/Dropbox/Dissertation/Data Analysis/freq_Control')
# os.getcwd()

# Type-Token analysis

In [4]:
# Four steps:
#1 Create a dictionary with file names as the keys and the chat text as values.
#2 Separate native speaker and L2 learner dataset.
#3 Extract the W + SFP constructions from the NS and L2 dataset, respectively.
#4 Calculate the type-token frequency of W + SFP construction
# Note: SFP: sentence final particles including a, ba, and ne.

# Step 1: 
# Create a dictionary: file name (key) --> text data (value)

In [5]:
def get_dic(path):
    # takes file path as input and return a dictionary 
    # with file name as key and the chat data as the value.
    fname=glob.glob(path)
    dic={}
    for f in fname:
        with open(f) as file:
            data=file.read()
        dic[f]=data
    return dic

In [6]:
# Define all the path or six chat sessions:
path_wk1 = 'WK1*.txt'
path_wk2 = 'WK2*.txt'
path_wk3 = 'WK3*.txt'
path_wk4 = 'WK4*.txt'
path_wk5 = 'WK5*.txt'
path_wk6 = 'WK6*.txt'
path_all = 'WK*.txt'
#get_dic(path_wk6)

# Step 2:
# Separate NS and L2 data

In [7]:
def get_NS_text(text):
    # gets all the native speaker (NS) lines from a text
    NS_text=''
    for line in text.splitlines():
        if line.startswith('NS'):
            NS_text+=line
    return NS_text

def get_L2_text(text):
    # gets all the L2 learner (L2) lines from a text
    L2_text=''
    for line in text.splitlines():
        if line.startswith('L2'):
            L2_text+=line
    return L2_text 

# Step 3:
# Get SFPs a, ba, and ne from NS and L2 datasets

In [8]:
# Get W + a
def get_a_NS(path):
    # gets SFP a from NS datasets in each week
    # the input path determines which week of data to analyze
    dic = get_dic(path)
    NS_text=''
    for key in dic.keys():
        text=dic[key]
        NS_text+=get_NS_text(text)
    li = re.findall('\w{1,3}啊', NS_text)
    return li
#print(get_a_NS(path_wk6))

def get_a_L2(path):
    # gets SFP a from L2 datasets in each week
    # the input path determines which week of data to analyze
    dic = get_dic(path)
    L2_text=''
    for key in dic.keys():
        text=dic[key]
        L2_text+=get_L2_text(text)
    li = re.findall('\w{1,3}啊', L2_text)
    return li
#print(get_a_L2(path_wk6))

In [9]:
# Get W + ba
def get_ba_NS(path):
    dic = get_dic(path)
    NS_text=''
    for key in dic.keys():
        text=dic[key]
        NS_text+=get_NS_text(text)
    li = re.findall('\w{1,3}吧', NS_text)
    return li

def get_ba_L2(path):
    dic = get_dic(path)
    L2_text=''
    for key in dic.keys():
        text=dic[key]
        L2_text+=get_L2_text(text)
    li = re.findall('\w{1,3}吧', L2_text)
    return li
#print(get_ba_NS(path_wk6))
#print(get_ba_L2(path_wk6))

In [10]:
# Get W + ne
def get_ne_NS(path):
    dic = get_dic(path)
    NS_text=''
    for key in dic.keys():
        text=dic[key]
        NS_text+=get_NS_text(text)
    li = re.findall('\w{1,3}呢', NS_text)
    return li

def get_ne_L2(path):
    dic = get_dic(path)
    L2_text=''
    for key in dic.keys():
        text=dic[key]
        L2_text+=get_L2_text(text)
    li = re.findall('\w{1,3}呢', L2_text)
    return li
#print(get_ne_NS(path_wk6))
#print(get_ne_L2(path_wk6))

# Step 4: 
# Token frequency, type frequency, type-token ratio

In [11]:
def get_ratio(li):
    # takes a list and calculates its type-token ratio
    ratio = len(set(li))/len(li)
    return ratio

# W + a

In [12]:
# Get a list of W + a from the NS dataset in each phase
NS_a1=get_a_NS(path_wk1)+get_a_NS(path_wk2) # Phase 1
NS_a2=get_a_NS(path_wk3)+get_a_NS(path_wk4) # Phase 2 
NS_a3=get_a_NS(path_wk5)+get_a_NS(path_wk6) # Phase 3

# Get a list of W + a from the L2 dataset in each phase
L2_a1=get_a_L2(path_wk1)+get_a_L2(path_wk2)
L2_a2=get_a_L2(path_wk3)+get_a_L2(path_wk4)
L2_a3=get_a_L2(path_wk5)+get_a_L2(path_wk6)

In [13]:
# adjust the NS_a1 list. Remove the unnecessary words to get the W + a construction
NS_a1_adjusted = ['搭配啊', '玩耍啊', '我啊', '是啊', '打算啊', '快啊', '好啊', '实习啊', '人啊', '这样啊', '重庆啊', '好啊', 
                  '酷啊', '棒啊', '不错啊', '对啊', '好啊', '了啊', '的啊', '研究啊', '星期天啊', '课啊', '课啊', '课啊', 
                  '关系啊', '文献啊', '看啊', '研究啊', '是啊', '你好啊', '你啊', '是啊', '感觉啊', '说法啊', '加油啊', 
                  '巴士啊', '人啊', '你好啊', '去啊', '春假啊', '可爱啊', '懵状啊', '什么啊', '对啊', '了啊', '做啊', '哪啊', 
                  '工作啊', '放假啊', '电视啊', '电影啊', '对啊', '好啊', '是啊', '对啊', '对啊', '对啊', '棒啊', '有啊', 
                  '学啊', '时候啊', '春假啊', '是啊', '你好啊', '学校啊', '不是啊']
NS_a2_adjusted = ['你好啊', '地方啊', '什么啊', '这样啊', '是啊', '是啊', '是啊', '不同啊', '早啊', '了啊', '创意啊', '花园啊', 
                  '啥啊', '好啊', '圣诞节啊', '可爱啊', '人啊', '是啊', '没事啊', '是啊', '是啊', '是啊', '圣诞节啊', '什么样啊', 
                  '赞啊', '你好啊', '对啊', '干嘛啊', '好亏啊', '几天啊', '对啊', '方便啊', '多啊', '浙江啊', '苏州啊', '南京啊', 
                  '了啊', '峡谷啊', '公园啊', '了啊', '山珍啊', '野味啊', '对啊', '人啊', '的啊', '马桶啊', '张家界啊', '湖南啊', 
                  '难啊', '这样啊', '不错啊', '宝贵啊', '这样啊', '中文啊', '的啊', '棒啊', '衣服啊', '贵啊', '你好啊', '有缘啊', 
                  '是啊', '开心啊', '什么啊', '庆祝啊'] # 2 '是啊是啊‘ from the NS_a2 counted as 4 '是啊’ in the adjusted list
NS_a3_adjusted = ['作业啊', '这样啊', '对啊', '是啊', '是啊', '尝尝啊', '你好啊', '识你啊', '丰富啊', '我想啊', '这样啊', 
                  '很棒啊', '意思啊', '好棒啊', '多久啊', '为啥啊', '不给啊', '活动啊', '很好啊', '不少啊', '识你啊', '新年啊', 
                  '辛苦啊', '厉害啊', '是啊', '是啊', '对啊', '哪里啊', '对啊', '对啊', '是啊', '对啊', '是啊', '多久啊', 
                  '有啊', '得妙啊', '是啊', '好啊', '好玩啊', '是啊', '网站啊', '记得啊', '旅游啊', '对啊', '对啊', '是你啊', 
                  '聊啊', '莱尔啊', '这样啊', '课啊', '圣诞节啊', '好奇啊', '火鸡啊', '几了啊', '上好啊', '丰富啊', '样子啊', 
                  '故事啊', '鞭炮啊', '这样啊'] # 2 '对啊对啊‘ from the NS_a2 counted as 4 '对啊’ in the adjusted list

In [14]:
# NS Type-token ratio of a in each phase
NS_ratio_a1 = get_ratio(NS_a1_adjusted)
print(NS_ratio_a1) # type-token ration in Phase 1
NS_ratio_a2 = get_ratio(NS_a2_adjusted)
print(NS_ratio_a2) # type-token ration in Phase 2
NS_ratio_a3 = get_ratio(NS_a3_adjusted)
print(NS_ratio_a3) # type-token ration in Phase 3

0.6818181818181818
0.703125
0.6833333333333333


In [15]:
# adjust the L2_a1 list. Remove the unnecessary words to get the W + a construction
L2_a1_adjusted = ['下雨啊', '长啊', '的啊', '天气啊', '问题啊', '天啊', '是啊', '知道啊', '开啊', '对啊', '好啊', '名字啊', 
                  '好啊', '厉害啊', '会啊', '要啊', '你好啊', '是啊', '万圣节啊', '高兴啊', '哪里啊', '是啊', '是啊', '是啊', 
                  '长时间啊', '是啊', '春季啊', '春假啊', '是啊', '是啊', '认识你啊', '是啊', '对啊', '厉害啊', '的啊',
                  '有气氛啊', '天啊', '谢谢啊', '有意思啊', '火鸡啊', '是啊']
L2_a2_adjusted = ['没事啊', '真的啊', '酷啊', '是啊', '是啊', '对啊', '穿啊', '对啊', '功课啊', '对啊', '是啊', '人啊', '好啊', 
                  '性啊', '忙啊', '怎么样啊', '漂亮啊', '对啊', '对啊', '对啊', '对啊', '是啊', '有趣啊', '是啊', 'cmu啊', '对啊', 
                  '对啊', '过啊', '蛋糕啊', '不错啊', '说啊', '不错啊', '你啊', '意思啊', '是啊', '对啊', '是啊']
L2_a3_adjusted = ['是啊', '对啊', '好啊', '对啊', '糖啊', '高兴啊', '是啊', '是啊', '是啊', '专业啊', '你啊', '对啊', '没有啊', 
                  '对啊', '是啊', '是啊', '是啊', '长啊', '对啊', '对啊', '对啊', '哪儿啊', '真的啊', '对啊', '对啊', '疯啊', 
                  '他们啊', '实习啊', '是啊', '不是啊', '对啊', '不是啊', '慕森啊', '是啊', '不错啊', '是啊', '对啊']
                    # removed 2 ‘啊啊啊’
len(L2_a1_adjusted),len(L2_a2_adjusted),len(L2_a3_adjusted)

(41, 37, 37)

In [16]:
# L2 Type-token ratio of a in each phase
L2_ratio_a1 = get_ratio(L2_a1_adjusted)
print(L2_ratio_a1) # type-token ration in Phase 1
L2_ratio_a2 = get_ratio(L2_a2_adjusted)
print(L2_ratio_a2) # type-token ration in Phase 2
L2_ratio_a3 = get_ratio(L2_a3_adjusted)
print(L2_ratio_a3) # type-token ration in Phase 3

0.6585365853658537
0.5675675675675675
0.4594594594594595


# W + ba

In [17]:
# Get a list of W + ba from the NS dataset in each phase
NS_ba1=get_ba_NS(path_wk1)+get_ba_NS(path_wk2) # Phase 1
NS_ba2=get_ba_NS(path_wk3)+get_ba_NS(path_wk4) # Phase 2 
NS_ba3=get_ba_NS(path_wk5)+get_ba_NS(path_wk6) # Phase 3 

# Get a list of W + ba from the L2 dataset in each phase
L2_ba1=get_ba_L2(path_wk1)+get_ba_L2(path_wk2)
L2_ba2=get_ba_L2(path_wk3)+get_ba_L2(path_wk4)
L2_ba3=get_ba_L2(path_wk5)+get_ba_L2(path_wk6)

In [18]:
NS_ba1_adjusted = ['聊天吧', '的吧', '还好吧', '还好吧', '主题吧', '300km/h吧', '新年吧', '聊吧', '算大吧', '城市吧', '去吧', 
                   '久吧', '多吧', '好吧', '了吧', 'night吧', '小时吧', '春节吧', '小时吧', '什么吧', '开始吧', '懒觉吧', 
                   '的店吧', '酒店吧', '了吧', '有吧', '春假吧', '作业吧', '这里吧', '还行吧', '不会吧', '聊吧', '不够吧', 
                   '火鸡吧', '这里吧', '不错吧', '感觉吧'] # removed '奔跑吧', a TV show name
NS_ba2_adjusted = ['匹兹堡吧', '远吧', '暑假吧', '暑假吧', '感恩节吧', '聊吧', '好吧', '庆祝吧', '主题吧', '是吧', '主题吧', 
                   '放假吧', '跨年吧', '没错吧', '还行吧', '了吧', '成都吧', '了吧', '看看吧', '到吧', '几天吧', 'Alto吧', 
                   '回家吧', '万圣节吧', '点吧', '节日吧', '多吧', 'Minor吧', '购物吧', '聊吧', '种类吧', '礼物吧', '电影吧', 
                   '还行吧']
NS_ba3_adjusted = ['多吧', '忙吧', '计划吧', '好吧', '春假吧', '联系吧', '新年吧', '取暖吧', '时候吧', '对吧', '橄榄球吧', 
                   '老师吧', '不高吧', '难吧', '两周吧', '对吧', '万圣节吧', '了吧', '忙吧', '还行吧', '人吧', '过吧', '聊吧', 
                   '圣诞吧', '一样吧', '鱼吧', '的吧', '还好吧', '一趟吧', '聊吧', '多吧', '这里吧']

In [19]:
# NS Type-token ratio of Ba in each phase
NS_ratio_ba1 = get_ratio(NS_ba1_adjusted)
print(NS_ratio_ba1) 
NS_ratio_ba2 = get_ratio(NS_ba2_adjusted)
print(NS_ratio_ba2)
NS_ratio_ba3 = get_ratio(NS_ba3_adjusted)
print(NS_ratio_ba3) 

0.8648648648648649
0.8529411764705882
0.875


In [20]:
L2_ba1_adjusted = ['的吧', '聊吧', '新年吧', '聊吧', '久吧', '学生吧', '买吧', 'bingchen吧', '电影吧', '狗吧', 'man吧', '对吧']
L2_ba2_adjusted = ['东北吧', '东北吧', '好吧', '感觉吧', '欢车吧', '推荐吧']
L2_ba3_adjusted = ['这样吧', '对吧', '感恩节吧', '这样吧', '人吧', '过吧', '匹兹堡吧']
len(L2_ba1_adjusted), len(L2_ba2_adjusted), len(L2_ba3_adjusted)

(12, 6, 7)

In [21]:
# L2 Type-token ratio of ba in each phase
L2_ratio_ba1 = get_ratio(L2_ba1_adjusted)
print(L2_ratio_ba1)
L2_ratio_ba2 = get_ratio(L2_ba2_adjusted)
print(L2_ratio_ba2) 
L2_ratio_ba3 = get_ratio(L2_ba3_adjusted)
print(L2_ratio_ba3)

0.9166666666666666
0.8333333333333334
0.8571428571428571


# W + ne

In [22]:
# Get a list of W + ne from the NS dataset in each phase
NS_ne1=get_ne_NS(path_wk1)+get_ne_NS(path_wk2) # Phase 1
NS_ne2=get_ne_NS(path_wk3)+get_ne_NS(path_wk4) # Phase 2 
NS_ne3=get_ne_NS(path_wk5)+get_ne_NS(path_wk6) # Phase 3 

# Get a list of W + ne from the L2 dataset in each phase
L2_ne1=get_ne_L2(path_wk1)+get_ne_L2(path_wk2)
L2_ne2=get_ne_L2(path_wk3)+get_ne_L2(path_wk4)
L2_ne3=get_ne_L2(path_wk5)+get_ne_L2(path_wk6)

In [23]:
NS_ne1_adjusted = ['你呢', '你呢', '不错呢', '盛大呢', '明白呢', '是呢', '的呢', '了呢', '了呢', '飞快呢', '的呢', '学校呢', 
                   '你呢', '工作呢', '的呢', '你呢', '时间呢', '新年呢', '什么呢', '是呢', '什么呢', '地方呢', '地方呢', 
                   '东西呢', 'CMU呢', '的呢', '什么呢', '开车呢', '你呢', '考呢', '多久呢', '你呢', '你呢', '你呢', '什么课呢', 
                   '中文呢', '哪儿呢', '什么呢', '你呢', '方向呢', '项目呢', '你呢', '你呢', '的呢', '吃法呢', '味道呢', '你呢',
                   '什么呢', '什么呢', '你呢', '哪里呢', '什么呢', '糖呢', '什么呢', '什么呢', '电影呢', '的呢', '你呢', '的呢',
                   '的呢', '好听呢', '的呢', '的呢', '什么呢', '错觉呢', '是呢', '不错呢', '你呢', '概念呢', '时代剧呢', 
                   '时候呢', '你呢', '啥呢', '事情呢', '了呢', '你呢', '人呢', '聚会呢', '糖呢', 'off呢', '喜欢呢', '你呢', 
                   '你呢', '你呢', '教呢', '几月呢', '什么呢', '了呢', '比如呢', '你呢', '哪里呢', '你呢', '感恩节呢', '没有呢', 
                   '的呢', '然后呢']

NS_ne2_adjusted = ['你呢', '暑假呢', '哪里呢', '做呢', '做呢', '的呢', '你呢', '节日呢', '比如呢', '节日呢', '你呢', '锅底呢', 
                   '菜呢', '者呢', '比如呢', '然后呢', '你呢', '你呢', '课呢', '你呢', '你呢', '的呢', '什么呢', '你呢', 
                   '出去呢', '什么呢', '电视呢', '什么呢', '新年呢', '你呢', '的呢', '什么呢', '的呢', '的呢', '计划呢', 
                   '考试呢', '你呢', '怎么样呢', '怎么样呢', '是呢', '烟花呢', '的呢', '不错呢', '确实呢', '的呢', '暑假呢', 
                   '干嘛呢', '生日呢', '的呢', '时间呢', '什么呢', '什么呢', '你呢', '的呢', '什么呢', '了呢', '的呢', '干嘛呢', 
                   '国家呢', '中文呢', '计划呢', '你呢', '什么呢', '是呢', '是呢', '什么呢', '是呢', '为什么呢', '是呢', 
                   '哪些呢', '礼物呢', '钱呢', '什么呢', '酒呢', '价格呢', '40刀呢', '的呢', '的呢', '过呢', '电影呢', '你呢', 
                   '你呢', '什么呢', '礼物呢', '什么呢', '什么呢', '什么呢', '了呢', '人呢', '着呢', '多久呢', '晚会呢']

NS_ne3_adjusted = ['你呢', '年呢', '决定呢', '你呢', '你呢', '你呢', '什么呢', '哪儿呢', '州呢', '什么呢', '的呢', '菜呢', '菜呢',
                   '什么呢', '玩儿呢', '知道呢', '春假呢', '是呢', '人呢', '人呢', '的呢', '研究生呢', '匹兹堡呢', '你呢', '的呢', 
                   '的呢', '嗯呢', '庆祝呢', '什么呢', '棒呢', '烟花呢', '嗯呢', '你呢', '的呢', '觉得呢', '你呢', '什么呢', 
                   '专业呢', '的呢', '哪里呢', '哪里呢', '工作呢', '活动呢', '你呢', '什么呢', '了呢', '巧克力呢', '专业呢', 
                   '你呢', '什么呢', '方面呢', '时间呢', '工作呢', '然后呢', '哪里呢', '圣诞节呢', '过呢', '什么呢', '国家呢', 
                   '庆呢', '你呢', '你呢', '滑雪呢', '多久呢', '难怪呢', '中国呢', '家呢', '你呢', '不呢', '的呢', '嗯呢', '了呢', 
                   '匹兹堡呢', '你呢', '访问呢','的呢', '的呢', '你呢', '你呢', '知道呢', '过呢', '上学呢', '这样子呢', '哪些呢', 
                   '什么呢', '为什么呢', '的呢', '的呢', '礼物呢', '好呢', '冷呢']

In [24]:
# NS Type-token ratio of ne in each phase
NS_ratio_ne1 = get_ratio(NS_ne1_adjusted)
print(NS_ratio_ne1) 
NS_ratio_ne2 = get_ratio(NS_ne2_adjusted)
print(NS_ratio_ne2)
NS_ratio_ne3 = get_ratio(NS_ne3_adjusted)
print(NS_ratio_ne3)

0.4791666666666667
0.45652173913043476
0.5164835164835165


In [25]:
L2_ne1_adjusted = ['你呢', '你呢', '你呢', '你呢', '你呢', '你呢', '多呢', '多久呢', '觉得呢', '你呢', '你呢', '什么呢', '你呢', 
                   '你呢', '你们呢', '年级呢', '土豆泥呢', '你呢', '你呢', '吃呢', '到呢', '你呢', '你呢', '你呢', '你呢', 
                   '你呢', '你呢', '你呢', '开始呢', '什么呢', '你呢', '你呢', '你呢', '你呢', '你呢', '春假呢', '你呢']
L2_ne2_adjusted = ['工作呢', '的呢', '你呢', '你呢', '实习呢', '你呢', '你呢', '你呢', '你呢', '你呢', '你呢', '什么呢', 
                   '你呢', '你呢', '觉得呢', '你呢', '你呢', '你呢', '你呢', '你呢', '你呢', '你呢', '你呢', '干嘛呢', '你呢', 
                   '你呢', '元旦呢', '为什么呢', '你呢', '上学呢', '你呢', '你呢', '你呢', '知道呢', '你呢', '你呢']
L2_ne3_adjusted = ['你呢', '个呢', '你呢', '了呢', '你呢', '你呢', '你呢', '你呢', '你呢', '的呢', '你呢', '你呢', '你呢', 
                   '在呢', '停留呢', '你呢', '你呢', '你呢', '你呢', '度假呢', '你呢', '你呢', '你呢', '那儿呢', '你呢', 
                   '过呢', '什么呢', '你呢', '你呢', '你呢', '现在呢']
len(L2_ne1_adjusted), len(L2_ne2_adjusted), len(L2_ne3_adjusted)

(37, 36, 31)

In [26]:
# L2 Type-token ratio of ne in each phase
L2_ratio_ne1 = get_ratio(L2_ne1_adjusted)
print(L2_ratio_ne1) 
L2_ratio_ne2 = get_ratio(L2_ne2_adjusted)
print(L2_ratio_ne2)
L2_ratio_ne3 = get_ratio(L2_ne3_adjusted)
print(L2_ratio_ne3) 

0.32432432432432434
0.3055555555555556
0.3548387096774194


# Contingency Analysis

In [27]:
#1 NS dataset: Get all the W + SFP constructions (all SFPs all phases in one object)
#2 NS dataset: Creat a dictionary with W + SFP as the key and its frequency as the value 
                # -> for contingency calculation
#3 L2 datasets: Get a dictionary with W + SFP as the key and its frequency as the value 
                # for each SFP in each phase

# Step 1:
# All W + SFP constructions from the NS dataset

In [28]:
# get all the a, ba, and ne from the three phases
All_NS_a = NS_a1_adjusted + NS_a2_adjusted + NS_a3_adjusted
All_NS_ba = NS_ba1_adjusted + NS_ba2_adjusted + NS_ba3_adjusted
All_NS_ne = NS_ne1_adjusted + NS_ne2_adjusted + NS_ne3_adjusted

# Token frequency of W + a, ba, and ne
print(len(All_NS_a), len(All_NS_ba), len(All_NS_ne))

# get all the SFPs used by the NS in the control group
All_NS_SFPs = All_NS_a + All_NS_ba + All_NS_ne
# print(sorted(All_NS_SFPs))

190 103 279


# Step 2: 
# Type frequency of W + SFP in the NS dataset

In [29]:
def getContruction_dic(a_list):
    # takes a list as input and returns a dictionary with list elements as the key 
    # and the elements' frequency as values
    result_dic = {}
    for i in a_list:
        if i in result_dic:
            result_dic[i]+=1
        else:
            result_dic[i]=1
    return result_dic

In [30]:
# get type frequency of W + SFP
All_NS_SFPs_dic = getContruction_dic(All_NS_SFPs)
for i in sorted(All_NS_SFPs_dic.keys()):
    print("%s:%s"% (i, All_NS_SFPs_dic[i])) # formatting: W + SFP: frequency

300km/h吧:1
40刀呢:1
Alto吧:1
CMU呢:1
Minor吧:1
night吧:1
off呢:1
一样吧:1
一趟吧:1
万圣节吧:2
上好啊:1
上学呢:1
不会吧:1
不同啊:1
不呢:1
不够吧:1
不少啊:1
不是啊:1
不给啊:1
不错吧:1
不错呢:3
不错啊:2
不高吧:1
专业呢:2
东西呢:1
两周吧:1
中国呢:1
中文呢:2
中文啊:1
丰富啊:2
为什么呢:2
为啥啊:1
主题吧:3
久吧:1
了吧:5
了呢:8
了啊:5
事情呢:1
人吧:1
人呢:4
人啊:4
什么吧:1
什么呢:34
什么啊:3
什么样啊:1
什么课呢:1
价格呢:1
作业吧:1
作业啊:1
你呢:51
你啊:1
你好啊:7
做呢:2
做啊:1
公园啊:1
关系啊:1
决定呢:1
冷呢:1
几了啊:1
几天吧:1
几天啊:1
几月呢:1
出去呢:1
创意啊:1
到吧:1
加油啊:1
匹兹堡吧:1
匹兹堡呢:2
南京啊:1
厉害啊:1
去吧:1
去啊:1
取暖吧:1
可爱啊:2
吃法呢:1
味道呢:1
哪些呢:2
哪儿呢:2
哪啊:1
哪里呢:6
哪里啊:1
啥呢:1
啥啊:1
喜欢呢:1
嗯呢:3
回家吧:1
国家呢:2
圣诞吧:1
圣诞节呢:1
圣诞节啊:3
地方呢:2
地方啊:1
城市吧:1
多久呢:3
多久啊:2
多吧:4
多啊:1
好亏啊:1
好吧:3
好听呢:1
好呢:1
好啊:6
好奇啊:1
好棒啊:1
好玩啊:1
学啊:1
学校呢:1
学校啊:1
宝贵啊:1
实习啊:1
家呢:1
对吧:2
对啊:16
小时吧:2
尝尝啊:1
山珍啊:1
峡谷啊:1
州呢:1
工作呢:3
工作啊:1
巧克力呢:1
巴士啊:1
干嘛呢:2
干嘛啊:1
年呢:1
庆呢:1
庆祝吧:1
庆祝呢:1
庆祝啊:1
开始吧:1
开心啊:1
开车呢:1
张家界啊:1
很好啊:1
很棒啊:1
得妙啊:1
忙吧:2
快啊:1
怎么样呢:2
意思啊:1
感恩节吧:1
感恩节呢:1
感觉吧:1
感觉啊:1
懒觉吧:1
懵状啊:1
成都吧:1
我啊:1
我想啊:1
打算啊:1
搭配啊:1
放假吧:1
放假啊:1
故事啊:1
教呢:1
文献啊:1
新年吧:2
新年呢:2
新年啊:1
方便啊:1
方向呢:1
方面呢:1
旅游啊:1
早啊:1
时代剧呢:1
时候吧:1
时候呢:1
时候

# Step 3: 
# Frequency of W + SFP in the L2 dataset

In [31]:
# frequency of W + a in each phase
L2_a1_dic = getContruction_dic(L2_a1_adjusted)
#print(L2_a1_dic)
L2_a2_dic = getContruction_dic(L2_a2_adjusted)
print(L2_a2_dic)
L2_a3_dic = getContruction_dic(L2_a3_adjusted)
#print(L2_a3_dic)
(len(L2_a1_dic), len(L2_a2_dic), len(L2_a3_dic))

{'没事啊': 1, '真的啊': 1, '酷啊': 1, '是啊': 7, '对啊': 10, '穿啊': 1, '功课啊': 1, '人啊': 1, '好啊': 1, '性啊': 1, '忙啊': 1, '怎么样啊': 1, '漂亮啊': 1, '有趣啊': 1, 'cmu啊': 1, '过啊': 1, '蛋糕啊': 1, '不错啊': 2, '说啊': 1, '你啊': 1, '意思啊': 1}


(27, 21, 17)

In [32]:
# frequency of W + ba in each phase
L2_ba1_dic = getContruction_dic(L2_ba1_adjusted)
print(L2_ba1_dic)
L2_ba2_dic = getContruction_dic(L2_ba2_adjusted)
print(L2_ba2_dic)
L2_ba3_dic = getContruction_dic(L2_ba3_adjusted)
print(L2_ba3_dic)
(len(L2_ba1_dic), len(L2_ba2_dic), len(L2_ba3_dic))

{'的吧': 1, '聊吧': 2, '新年吧': 1, '久吧': 1, '学生吧': 1, '买吧': 1, 'bingchen吧': 1, '电影吧': 1, '狗吧': 1, 'man吧': 1, '对吧': 1}
{'东北吧': 2, '好吧': 1, '感觉吧': 1, '欢车吧': 1, '推荐吧': 1}
{'这样吧': 2, '对吧': 1, '感恩节吧': 1, '人吧': 1, '过吧': 1, '匹兹堡吧': 1}


(11, 5, 6)

In [33]:
# frequency of W + ne in each phase
L2_ne1_dic = getContruction_dic(L2_ne1_adjusted)
#print(L2_ne1_dic)
L2_ne2_dic = getContruction_dic(L2_ne2_adjusted)
#print(L2_ne2_dic)
L2_ne3_dic = getContruction_dic(L2_ne3_adjusted)
print(L2_ne3_dic)
(len(L2_ne1_dic), len(L2_ne2_dic), len(L2_ne3_dic))

{'你呢': 21, '个呢': 1, '了呢': 1, '的呢': 1, '在呢': 1, '停留呢': 1, '度假呢': 1, '那儿呢': 1, '过呢': 1, '什么呢': 1, '现在呢': 1}


(12, 11, 11)

# Correlation: contingency and frequency

In [34]:
f_location = '/Users/qiongli/Dropbox/Dissertation/Data Analysis/'
f_name = 'Analysis_Contingency.xlsx'
# Experimental group
exp_cont_a = pd.read_excel(f_location+f_name, sheetname = 'exp_a')
exp_cont_ba = pd.read_excel(f_location+f_name, sheetname = 'exp_ba')
exp_cont_ne = pd.read_excel(f_location+f_name, sheetname = 'exp_ne')
exp_cont_all = pd.concat([exp_cont_a, exp_cont_ba, exp_cont_ne])
exp_cont_L2 = exp_cont_all[(exp_cont_all.L2_Phase1>0) | (exp_cont_all.L2_Phase2>0) | (exp_cont_all.L2_Phase3>0)]
exp_cont_L2 = exp_cont_L2.fillna(0)
exp_cont_L2.head()

Unnamed: 0,NS_SFPs,a,b,c,d,Contingency,L2_Phase1,L2_Phase2,L2_Phase3
1,知道啊:4,4,0,199,396,0.665546,0.0,0.0,1.0
3,意思啊:2,2,0,201,396,0.663317,0.0,1.0,4.0
4,有意思啊:2,2,0,201,396,0.663317,0.0,1.0,0.0
18,厉害啊:1,1,0,202,396,0.662207,0.0,2.0,1.0
34,有啊:1,1,0,202,396,0.662207,0.0,1.0,0.0


In [35]:
from scipy.stats import pearsonr
e1 = c1=scipy.stats.kendalltau(exp_cont_L2['Contingency'], exp_cont_L2['L2_Phase1'])
e2 = c1=scipy.stats.kendalltau(exp_cont_L2['Contingency'], exp_cont_L2['L2_Phase2'])
e3 = c1=scipy.stats.kendalltau(exp_cont_L2['Contingency'], exp_cont_L2['L2_Phase3'])
e1, e2, e3

(KendalltauResult(correlation=0.036228850689900498, pvalue=0.635834662947383),
 KendalltauResult(correlation=0.17515252321019242, pvalue=0.018878364770336699),
 KendalltauResult(correlation=0.18063803764221553, pvalue=0.016796424939455101))

In [48]:
# Control group
ctrl_cont_a = pd.read_excel(f_location+f_name, sheetname = 'ctrl_a')
ctrl_cont_ba = pd.read_excel(f_location+f_name, sheetname = 'ctrl_ba')
ctrl_cont_ne = pd.read_excel(f_location+f_name, sheetname = 'ctrl_ne')
#ctrl_cont_ne.head()
ctrl_cont_all = pd.concat([ctrl_cont_a, ctrl_cont_ba, ctrl_cont_ne])
ctrl_cont_L2 = ctrl_cont_all[(ctrl_cont_all.L2_Phase1>0) | (ctrl_cont_all.L2_Phase2>0) | (ctrl_cont_all.L2_Phase3>0)]
ctrl_cont_L2 = ctrl_cont_L2.fillna(0)
ctrl_cont_L2.head()

Unnamed: 0,NS_SFPs,a,b,c,d,Contingency,L2_Phase1,L2_Phase2,L2_Phase3
0,你啊:4,4,51,186,331,-0.287041,1.0,1.0,1.0
1,的啊:3,3,36,187,346,-0.273921,2.0,0.0,0.0
3,哪里啊:1,1,6,189,376,-0.191656,1.0,0.0,0.0
13,怎么样啊:1,1,2,189,380,0.001172,0.0,1.0,0.0
17,不错啊:2,2,4,188,378,0.001178,0.0,2.0,1.0


In [49]:
c1 = c1=scipy.stats.kendalltau(ctrl_cont_L2['Contingency'], ctrl_cont_L2['L2_Phase1'])
c2 = c1=scipy.stats.kendalltau(ctrl_cont_L2['Contingency'], ctrl_cont_L2['L2_Phase2'])
c3 = c1=scipy.stats.kendalltau(ctrl_cont_L2['Contingency'], ctrl_cont_L2['L2_Phase3'])
c1, c2, c3

(KendalltauResult(correlation=0.035701228858820114, pvalue=0.69264053623297994),
 KendalltauResult(correlation=0.2574818480741547, pvalue=0.0043562776139502253),
 KendalltauResult(correlation=0.035701228858820114, pvalue=0.69264053623297994))

# Contingency in individual SFP

In [38]:
# get constructions shared by L2 and NS datasets
ctrl_a_shared = ctrl_cont_a[ctrl_cont_a.Contingency!=0].dropna(thresh=7)
ctrl_a_shared.sort_values(by='Contingency', ascending=False)

Unnamed: 0,NS_SFPs,a,b,c,d,Contingency,L2_Phase1,L2_Phase2,L2_Phase3
103,你好啊:7,7,0,183,382,0.676106,1.0,,
95,酷啊:1,1,0,189,382,0.669002,,1.0,
80,没事啊:1,1,0,189,382,0.669002,,1.0,
51,哪啊:1,1,0,189,382,0.669002,,,1.0
50,厉害啊:1,1,0,189,382,0.669002,2.0,,
41,不是啊:1,1,0,189,382,0.669002,,,2.0
38,对啊:16,16,2,174,380,0.574809,2.0,20.0,11.0
35,是啊:21,21,10,169,372,0.365035,10.0,7.0,10.0
32,好啊:8,8,7,182,375,0.206583,2.0,1.0,
28,火鸡啊:1,1,1,189,381,0.168421,1.0,,


In [39]:
ctrl_ba_shared = ctrl_cont_ba[ctrl_cont_ba.Contingency!=0].dropna(thresh=7)
ctrl_ba_shared.sort_values(by='Contingency', ascending=False)

Unnamed: 0,NS_SFPs,a,b,c,d,Contingency,L2_Phase1,L2_Phase2,L2_Phase3
12,久吧:1,1,0,102,469,0.821366,1.0,,
58,聊吧:6,6,1,97,468,0.685461,2.0,,
35,感恩节吧:1,1,1,102,468,0.321053,,,1.0
36,感觉吧:1,1,1,102,468,0.321053,,1.0,
28,好吧:6,6,9,97,460,0.225853,,1.0,
40,新年吧:2,2,3,101,466,0.221869,1.0,,
19,匹兹堡吧:1,1,2,102,467,0.154071,,,1.0
51,电影吧:1,1,3,102,466,0.070423,1.0,,
66,过吧:1,1,3,102,466,0.070423,,,1.0
15,人吧:1,1,8,102,461,-0.070061,,,1.0


In [40]:
ctrl_ne_shared = ctrl_cont_ne[ctrl_cont_ne.Contingency!=0].dropna(thresh=7)
ctrl_ne_shared.sort_values(by='Contingency', ascending=False)

Unnamed: 0,NS_SFPs,a,b,c,d,Contingency,L2_Phase1,L2_Phase2,L2_Phase3
10,为什么呢:2,2,0,277,293,0.514035,,1.0,
75,知道呢:2,2,0,277,293,0.514035,,1.0,
0,刀呢:1,1,0,278,293,0.513135,1.0,,
1,CMU呢:1,1,0,278,293,0.513135,,1.0,
86,觉得呢:1,1,0,278,293,0.513135,1.0,1.0,
15,你呢:51,51,4,228,289,0.486267,25.0,26.0,21.0
11,什么呢: 34,34,4,245,289,0.435935,2.0,1.0,1.0
72,的呢:34,34,5,245,288,0.412133,,1.0,1.0
38,工作呢:3,3,1,276,292,0.264085,,1.0,
90,过呢:3,3,1,276,292,0.264085,,,1.0


In [41]:
exp_a_shared = exp_cont_a[exp_cont_a.Contingency!=0].dropna(thresh=7)
exp_a_shared.sort_values(by='Contingency', ascending=False)

Unnamed: 0,NS_SFPs,a,b,c,d,Contingency,L2_Phase1,L2_Phase2,L2_Phase3
1,知道啊:4,4,0,199,396,0.665546,,,1.0
4,有意思啊:2,2,0,201,396,0.663317,,1.0,
3,意思啊:2,2,0,201,396,0.663317,,1.0,4.0
18,厉害啊:1,1,0,202,396,0.662207,,2.0,1.0
34,有啊:1,1,0,202,396,0.662207,,1.0,
36,有趣啊:1,1,0,202,396,0.662207,1.0,1.0,
45,火锅啊:1,1,0,202,396,0.662207,,,1.0
48,熊猫啊:1,1,0,202,396,0.662207,,1.0,
70,难啊:1,1,0,202,396,0.662207,1.0,4.0,
73,这样啊:16,16,2,187,394,0.56703,1.0,,2.0


In [42]:
exp_ba_shared = exp_cont_ba[exp_cont_ba.Contingency!=0].dropna(thresh=7)
exp_ba_shared.sort_values(by='Contingency', ascending=False)

Unnamed: 0,NS_SFPs,a,b,c,d,Contingency,L2_Phase1,L2_Phase2,L2_Phase3
0,这里吧:4,4,0,123,472,0.793277,,1.0,1.0
61,个吧:2,2,0,125,472,0.79062,,,1.0
15,觉吧:1,1,0,126,472,0.789298,,1.0,1.0
45,回家吧:1,1,0,126,472,0.789298,,1.0,
70,暑假吧:1,1,0,126,472,0.789298,,2.0,
21,聊吧:4,4,2,123,470,0.459247,,1.0,
73,新年吧:4,4,2,123,470,0.459247,,,1.0
63,忙吧:2,2,1,125,471,0.456935,,1.0,
69,春假吧:5,5,3,122,469,0.41857,,1.0,
33,对吧:5,5,7,122,465,0.20883,1.0,,1.0


In [43]:
exp_ne_shared = exp_cont_ne[exp_cont_ne.Contingency!=0].dropna(thresh=7)
exp_ne_shared.sort_values(by='Contingency', ascending=False)

Unnamed: 0,NS_SFPs,a,b,c,d,Contingency,L2_Phase1,L2_Phase2,L2_Phase3
18,你呢:66,66,0,203,330,0.619137,33.0,39.0,38.0
8,为什么呢:3,3,0,266,330,0.553691,,2.0,
17,你们呢:2,2,0,267,330,0.552764,,,1.0
5,不知道呢:1,1,0,268,330,0.551839,1.0,,1.0
68,然后呢:1,1,0,268,330,0.551839,,1.0,
90,过呢:5,5,1,264,329,0.388139,,,1.0
14,什么呢:36,36,11,233,319,0.343856,3.0,5.0,3.0
9,了呢:12,12,6,257,324,0.224326,,1.0,
55,怎么样呢:2,2,1,267,329,0.21868,1.0,2.0,1.0
88,课呢:2,2,1,267,329,0.21868,,1.0,
