In [None]:
import pandas as pd
from collections import Counter
import jieba
from tqdm import tqdm
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the sample submission file and preview its first rows.
# Note: df_submit is rebuilt from scratch further down before the real
# submission is written, so this cell is for inspection only.
df_submit = pd.read_csv("./sample_submission.csv")
df_submit.head()

In [None]:
# Load the training set from the working directory.
df_train = pd.read_csv( "./train.csv" )

In [None]:
# (rows, columns) of the training set.
df_train.shape

In [None]:
# Show every column of the first training row.
df_train.loc[0]

In [None]:
# Count how many training rows carry each label value.
Counter( df_train[ "label" ] )

In [None]:
# Load the test set from the working directory and show its (rows, columns).
df_test = pd.read_csv( "./test.csv" )
df_test.shape

In [None]:
# Show every column of the first test row.
df_test.loc[0]

In [None]:
# Preview the first training title with all ASCII spaces removed.
df_train.loc[ 0, "title1_zh" ].replace(" ", "")

In [None]:
# Preview jieba word segmentation on the same title (cut_all=False), with the
# resulting tokens joined by single spaces.
" ".join(jieba.cut(df_train.loc[ 0, "title1_zh" ], cut_all=False))

In [None]:
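# Optional: uncomment the two lines below to work on a small subset of each
# frame while iterating (label-based slices, so the end bounds are inclusive).
# df_train = df_train.loc[:2999, :]
# df_test = df_test.loc[:99, :]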

In [None]:
# Keep only the id, the two Chinese titles and (for train) the label.
shared_columns = ["id", "title1_zh", "title2_zh"]
df_train = df_train.loc[:, shared_columns + ["label"]]
df_test = df_test.loc[:, shared_columns]

In [None]:
# Replace every missing value with the literal string "NaN" so the text
# processing below always receives a str.
missing_token = "NaN"
df_train, df_test = (frame.fillna(missing_token) for frame in (df_train, df_test))

In [None]:
def _segment(text):
    """Return `text` split by jieba (cut_all=False) with tokens joined by spaces."""
    return " ".join(jieba.cut(text, cut_all=False))


# Overwrite both title columns of both frames with their segmented form.
for frame in (df_train, df_test):
    for column in ("title1_zh", "title2_zh"):
        frame[column] = frame[column].map(_segment)

In [None]:
# Collect every segmented title (train first, then test; title1 before title2)
# into one flat list of documents.
corpus = []
for frame in (df_train, df_test):
    for column in ("title1_zh", "title2_zh"):
        corpus.extend(frame[column].tolist())

In [None]:
# Fit a TF-IDF vocabulary and IDF weights on the whole corpus, using the
# vectorizer's default settings.
# NOTE(review): scikit-learn's default token_pattern only keeps tokens of two
# or more word characters, so single-character segments in the space-joined
# titles are presumably dropped here - confirm this is intended.
vectorizer = TfidfVectorizer().fit(corpus)
# tfidf = vectorizer.transform(corpus)

In [None]:
def _to_tfidf(text):
    """Return the single-row TF-IDF result of `vectorizer` for one segmented title."""
    return vectorizer.transform([text])[0]


# Overwrite each segmented title with its TF-IDF row, one cell at a time.
for frame in (df_train, df_test):
    for column in ("title1_zh", "title2_zh"):
        frame[column] = frame[column].map(_to_tfidf)

In [None]:
# Inspect what is now stored for the first row's second title (the column was
# overwritten with vectorizer.transform output in the previous step).
df_train.loc[0, 'title2_zh']

In [None]:
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# Show every column of training row 15, used as a spot check below.
df_train.loc[15]

In [None]:
# Spot check: dot product between the two title vectors of training row 15.
first_vec = df_train.loc[15, "title1_zh"]
second_vec = df_train.loc[15, "title2_zh"]
linear_kernel(first_vec, second_vec).ravel()[0]

In [None]:
# Alternative kept for reference (row-wise apply instead of an explicit loop):
# df_train.apply( lambda x: linear_kernel( x["title1_zh"], x["title2_zh"] ).flatten()[0], axis=1 )

# Renumber both frames 0..n-1 and discard the previous index.
df_train, df_test = (frame.reset_index(drop=True) for frame in (df_train, df_test))

In [None]:
# Create a float "similarity" column, initialised to 0.0, on both frames.
for frame in (df_train, df_test):
    frame['similarity'] = 0.0

In [None]:
# Score every training pair: the dot product of its two TF-IDF title vectors.
# TfidfVectorizer L2-normalises rows by default, so this is the cosine
# similarity of the two titles.
# Wrapping the iterator lets tqdm advance and close the bar by itself.
pbar = tqdm(df_train.iterrows(), total=len(df_train))

for idx, row in pbar:
    sim = linear_kernel( row["title1_zh"], row["title2_zh"] ).flatten()[0]
    # DataFrame.set_value was removed in pandas 1.0; .at is the supported
    # label-based scalar setter.
    df_train.at[idx, "similarity"] = sim
    

In [None]:
# Score every test pair: the dot product of its two TF-IDF title vectors.
# TfidfVectorizer L2-normalises rows by default, so this is the cosine
# similarity of the two titles.
# Wrapping the iterator lets tqdm advance and close the bar by itself.
pbar = tqdm(df_test.iterrows(), total=len(df_test))

for idx, row in pbar:
    sim = linear_kernel( row["title1_zh"], row["title2_zh"] ).flatten()[0]
    # DataFrame.set_value was removed in pandas 1.0; .at is the supported
    # label-based scalar setter.
    df_test.at[idx, "similarity"] = sim
    

In [None]:
# Summary statistics of the similarity score for pairs labelled "agreed".
agreed_mask = df_train['label'] == "agreed"
df_train.loc[agreed_mask, "similarity"].describe()

In [None]:
# Summary statistics of the similarity score for pairs labelled "disagreed".
disagreed_mask = df_train['label'] == "disagreed"
df_train.loc[disagreed_mask, "similarity"].describe()

In [None]:
# Summary statistics of the similarity score for pairs labelled "unrelated".
unrelated_mask = df_train['label'] == "unrelated"
df_train.loc[unrelated_mask, "similarity"].describe()

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np

def _similarity_histogram(values, name):
    """Build a probability-normalised histogram trace for one label.

    Parameters
    ----------
    values : list of float
        Similarity scores to plot; must be non-empty because the bin range is
        taken from its min and max.
    name : str
        Legend entry for the trace.

    Returns
    -------
    plotly.graph_objs.Histogram
        Trace with fixed 0.03-wide bins spanning the observed range.
    """
    return go.Histogram(x=values, histnorm='probability',
                        name=name,
                        xbins=dict(start=np.min(values),
                                   size=0.03,
                                   end=np.max(values)))


# Similarity scores per label, selected with a single .loc call.
x1 = df_train.loc[ df_train['label'] == "disagreed", "similarity" ].tolist()
x2 = df_train.loc[ df_train['label'] == "agreed", "similarity" ].tolist()
x3 = df_train.loc[ df_train['label'] == "unrelated", "similarity" ].tolist()

trace = _similarity_histogram(x1, "disagreed")
trace1 = _similarity_histogram(x2, "agreed")
trace2 = _similarity_histogram(x3, "unrelated")

layout = go.Layout(
    title="Similarity Distribution",
    xaxis=dict( 
        title="Similarity"
    ),
    yaxis=dict( 
        title="Ratio"
    )
)

# go.Data is deprecated; go.Figure accepts a plain list of traces.
fig = go.Figure(data=[trace, trace1, trace2], layout=layout)
# NOTE(review): `py` is the `plotly.plotly` module imported in this cell; it
# was moved to the separate chart_studio package in plotly 4 - confirm the
# installed plotly version still provides it.
py.iplot(fig, filename='histogram-prob-dist')

In [None]:
# Start a fresh, empty submission frame with the two output columns. This
# replaces the df_submit loaded earlier from sample_submission.csv.
df_submit = pd.DataFrame( columns=['Id', 'Category'] )

In [None]:
# Preview the first test ids.
df_test['id'].head()

In [None]:
# Add a "Category" column filled with empty strings; the labelling loop below
# writes the predicted label into it row by row.
df_test['Category'] = ""

In [None]:
# Re-read the test file: the title columns of df_test were overwritten during
# preprocessing, so the original text has to come from a fresh copy.
df_test_dup = pd.read_csv("./test.csv")

In [None]:
# Attach the original (unsegmented) second title for keyword matching.
# df_test_dup is a fresh read, so missing titles are float NaN here; replace
# them with empty strings so later str operations such as .count() cannot
# fail on a float.
df_test['title2'] = df_test_dup['title2_zh'].fillna("")

In [None]:
# Preview the test frame with the added columns.
df_test.head()

In [None]:
# Pairs scoring below this similarity are labelled "unrelated".
SIMILARITY_THRESHOLD = 0.25

# Substrings whose presence in the second title marks the pair as "disagreed".
key_words = ["辟谣", "网警", "谣言", "勿信", "传谣", "假的"]


def _predict_label(similarity, title):
    """Return the rule-based label for one title pair.

    Parameters
    ----------
    similarity : float
        Similarity score of the pair.
    title : object
        Raw second title; any non-string value (e.g. float NaN from a missing
        title) is treated as an empty title.

    Returns
    -------
    str
        "unrelated" if the score is below SIMILARITY_THRESHOLD, otherwise
        "disagreed" if the title contains any entry of key_words, else
        "agreed".
    """
    if not similarity >= SIMILARITY_THRESHOLD:
        # Written as a negated >= so a NaN score falls into "unrelated".
        return "unrelated"
    if not isinstance(title, str):
        title = ""
    if any(word in title for word in key_words):
        return "disagreed"
    return "agreed"


# Wrapping the iterator lets tqdm advance and close the bar by itself.
pbar = tqdm(df_test.iterrows(), total=len(df_test))

for idx, row in pbar:
    # DataFrame.set_value was removed in pandas 1.0; .at is the supported
    # label-based scalar setter.
    df_test.at[idx, "Category"] = _predict_label(row['similarity'], row['title2'])
    

In [None]:
from collections import Counter

# Count how many test rows received each predicted label.
Counter( df_test['Category'] )

In [None]:
# Copy the ids and predicted labels from df_test into the submission frame.
for target_column, source_column in (("Id", "id"), ("Category", "Category")):
    df_submit[target_column] = df_test[source_column]

In [None]:
# Check the label counts in the submission frame.
Counter(df_submit['Category'])

In [None]:
# Write the submission to the working directory; index=False leaves the row
# index out of the file.
df_submit.to_csv("./submit.csv", index=False)