In [94]:
import numpy as np
import pandas as pd

In [95]:
# Load the header-less email/topic sample and attach column labels to it.
raw_data = pd.read_csv(
    "email_sample/data.csv",
    header=None,
    sep=",",
)
raw_data.columns = pd.Index(["ID", "Topic", "Email", "Count", "User"])

In [96]:
# Preview the first rows of the freshly loaded frame with its assigned column labels.
raw_data.head()

Unnamed: 0,ID,Topic,Email,Count,User
0,156296,incontact,382805,3.0,EastNearbyCrumpton5243
1,156297,c16,382805,1.0,EastNearbyCrumpton5243
2,136241,blog,382805,1.0,EastNearbyCrumpton5243
3,156226,2017,382805,1.0,EastNearbyCrumpton5243
4,143483,june 1st,382812,1.0,EastNearbyCrumpton5243


In [97]:
# Keep every column from 'ID' through 'Count' (label slice, both ends inclusive),
# which leaves out the trailing 'User' column.
data = raw_data.loc[:, slice('ID', 'Count')]
data.head()

Unnamed: 0,ID,Topic,Email,Count
0,156296,incontact,382805,3.0
1,156297,c16,382805,1.0
2,136241,blog,382805,1.0
3,156226,2017,382805,1.0
4,143483,june 1st,382812,1.0


In [98]:
# Number of rows per topic ID, ordered by ID, as a two-column frame (ID, Count).
topicsCount = (
    data['ID']
    .value_counts()
    .sort_index()
    .rename_axis("ID")
    .reset_index(name="Count")
)
topicsCount.head()

Unnamed: 0,ID,Count
0,136086,8
1,136091,20
2,136095,6
3,136100,5
4,136103,1


In [99]:
# Add a constant 'Topic' column; every row receives the same literal string.
# NOTE(review): this looks like a placeholder label rather than the real topic text - confirm.
topicsCount['Topic'] = "Some Topic"

In [100]:
# Preview the per-ID counts together with the newly added 'Topic' column.
topicsCount.head()

Unnamed: 0,ID,Count,Topic
0,136086,8,Some Topic
1,136091,20,Some Topic
2,136095,6,Some Topic
3,136100,5,Some Topic
4,136103,1,Some Topic


In [101]:
# Topic IDs that occur in at most 5 rows are treated as unwanted.
unwantedTopics = topicsCount.loc[topicsCount['Count'].le(5)]

In [102]:
# Preview the topic IDs whose row count is 5 or lower.
unwantedTopics.head()

Unnamed: 0,ID,Count,Topic
3,136100,5,Some Topic
4,136103,1,Some Topic
5,136104,2,Some Topic
6,136111,4,Some Topic
9,136118,1,Some Topic


In [103]:
# Drop every row whose topic ID appears in the unwanted list.
usefulData = data.loc[~data['ID'].isin(unwantedTopics['ID'])]
usefulData.head()

Unnamed: 0,ID,Topic,Email,Count
0,156296,incontact,382805,3.0
2,136241,blog,382805,1.0
3,156226,2017,382805,1.0
4,143483,june 1st,382812,1.0
6,156296,incontact,382812,5.0


In [104]:
# Complement of the unwanted set: topic IDs that occur in more than 5 rows.
usefulTopicsCount = topicsCount.loc[topicsCount['Count'].gt(5)]
usefulTopicsCount.head()

Unnamed: 0,ID,Count,Topic
0,136086,8,Some Topic
1,136091,20,Some Topic
2,136095,6,Some Topic
7,136113,10,Some Topic
8,136114,46,Some Topic


In [105]:
# Persist the retained topic counts to 'topics.csv' in the working directory.
# to_csv is called with its defaults, so the frame's (non-contiguous) index is written as well.
usefulTopicsCount.to_csv('topics.csv')

In [106]:
# Self-join on Email: one row per ordered pair of topics that share an email
# (a topic is also paired with itself). The "1"/"2" suffixes give the columns
# ID1, Topic1, Email, Count1, ID2, Topic2, Count2.
usefulTopics = usefulData.merge(usefulData, on="Email", suffixes=("1", "2"))

In [107]:
# Preview the topic pairs produced by the self-merge on Email.
usefulTopics.head()

Unnamed: 0,ID1,Topic1,Email,Count1,ID2,Topic2,Count2
0,156296,incontact,382805,3.0,156296,incontact,3.0
1,156296,incontact,382805,3.0,136241,blog,1.0
2,156296,incontact,382805,3.0,156226,2017,1.0
3,136241,blog,382805,1.0,156296,incontact,3.0
4,136241,blog,382805,1.0,136241,blog,1.0


In [108]:
# Pair weight: Count1 * Count2 for two different topics; when both sides carry
# the same topic ID, the weight is just Count1.
usefulTopics['Count'] = (usefulTopics['Count1'] * usefulTopics['Count2']).mask(
    usefulTopics['ID1'].eq(usefulTopics['ID2']),
    usefulTopics['Count1'],
)

In [109]:
# Preview the pairs with the newly computed 'Count' weight column.
usefulTopics.head()

Unnamed: 0,ID1,Topic1,Email,Count1,ID2,Topic2,Count2,Count
0,156296,incontact,382805,3.0,156296,incontact,3.0,3.0
1,156296,incontact,382805,3.0,136241,blog,1.0,3.0
2,156296,incontact,382805,3.0,156226,2017,1.0,3.0
3,136241,blog,382805,1.0,156296,incontact,3.0,3.0
4,136241,blog,382805,1.0,136241,blog,1.0,1.0


In [110]:
# Sum the pair weights over all emails, leaving one row per (ID1, Topic1, ID2, Topic2).
usefulTopics = usefulTopics.groupby(
    ['ID1', 'Topic1', 'ID2', 'Topic2'], as_index=False
)[['Count']].sum()

In [115]:
# Preview the aggregated weight per topic pair.
usefulTopics.head()

Unnamed: 0,ID1,Topic1,ID2,Topic2,Count
0,136086,ror,136086,ror,17.0
1,136086,ror,136113,ssh,1.0
2,136086,ror,136162,css,24.0
3,136086,ror,136180,startups,12.0
4,136086,ror,136181,having,1.0


In [116]:
# Keep only topic pairs whose summed weight exceeds 5.
usefulTopics = usefulTopics.loc[usefulTopics['Count'].gt(5)]

In [135]:
# Preview the pairs that survived the weight threshold (original row labels are kept).
usefulTopics.head()

Unnamed: 0,ID1,Topic1,ID2,Topic2,Count
0,136086,ror,136086,ror,17.0
2,136086,ror,136162,css,24.0
3,136086,ror,136180,startups,12.0
5,136086,ror,136233,javascript,24.0
9,136086,ror,136352,i'd,12.0


In [118]:
# Persist the filtered topic-pair weights to CSV in the working directory.
# NOTE(review): the file name looks like a typo for "mutual_topics.csv"; it is left
# unchanged here because other code may read the file under this exact name - confirm.
usefulTopics.to_csv('mutal_ropics.csv')

In [119]:
# IDs of the retained topics (as a Series); used below to size and label the square matrix.
topicIds = usefulTopicsCount['ID']

In [120]:
# Square, zero-filled frame with one row and one column per retained topic ID.
dataset = pd.DataFrame(np.zeros((topicIds.size,) * 2))

In [121]:
# Preview the zero-initialised matrix (still carrying default integer labels).
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,341,342,343,344,345,346,347,348,349,350
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:
# Label the matrix columns with the retained topic IDs.
dataset.columns = pd.Index(topicIds)

In [123]:
# Label the matrix rows with the same topic IDs so it can be addressed as [id, id].
dataset.index = pd.Index(topicIds)

In [133]:
# Preview the matrix now that both axes are labelled by topic ID.
dataset.head()

ID,136086,136091,136095,136113,136114,136119,136142,136145,136151,136162,...,158299,158334,158462,158471,158544,158779,158780,158792,158793,158794
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
136086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [167]:
# Copy every topic-pair weight into the square matrix and mirror it so the
# matrix stays symmetric.
#
# The earlier version wrapped the value in np.where with the condition
# `usefulTopics.loc[i,'ID1'] == id1 and usefulTopics.loc[i,'ID2']`. The first
# operand is always True, so the condition reduced to the truthiness of ID2:
# the count was always stored, except that a topic ID of 0 silently stored 0.
# It also stored a 0-d array rather than a scalar. The count is now assigned
# directly.
for id1, id2, pair_count in zip(
    usefulTopics['ID1'], usefulTopics['ID2'], usefulTopics['Count']
):
    dataset.loc[id1, id2] = pair_count
    dataset.loc[id2, id1] = pair_count

In [170]:
# Preview the matrix after the pair weights have been filled in.
dataset.head()

ID,136086,136091,136095,136113,136114,136119,136142,136145,136151,136162,...,158299,158334,158462,158471,158544,158779,158780,158792,158793,158794
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
136086,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,...,0.0,16.0,0.0,0.0,0.0,0.0,0.0,96.0,10.0,0.0
136091,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,10.0,56.0,0.0,0.0
136095,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136113,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,15.0,30.0,8.0,0.0
136114,0.0,0.0,0.0,0.0,54.0,0.0,0.0,0.0,0.0,0.0,...,0.0,35.0,10.0,0.0,0.0,0.0,10.0,193.0,0.0,0.0
