# Feature Extraction

In [1]:
import pandas
import json
import re
from emoji import UNICODE_EMOJI
import emojis

# Data Loading

In [2]:
feedsDict = {}
labelsDict = {}
dataFrame = pandas.DataFrame(columns=['ID','Tweets','Gender','Occupation','Fame','Birthyear'])

## Reading Tweets

In [3]:
with open('dataset/testData/'+ 'tweets.ndjson', encoding='utf8') as feeds:
    print('<!---------------------Feeds opened successfully--------------------!>')
    json_data = json.load(feeds)
    dataFrame['ID'] = json_data.keys()
    dataFrame.set_index('ID',inplace=True)
    for data in json_data:
        dataFrame.loc[data]['Tweets'] = json_data[data]
print('<!---------------------Feeds read successfully----------------------!>\n')

<!---------------------Feeds opened successfully--------------------!>
<!---------------------Feeds read successfully----------------------!>



## Reading Labels

In [4]:
label_data = pandas.read_excel('dataset/testData/' + 'labels.xlsx', index_col='username')

for data in dataFrame.index.values:
    if data in label_data.index.values:
        dataFrame.loc[data]['Birthyear'] = label_data.loc[data]['Birthyear'][1:5] if type(label_data.loc[data]['Birthyear'])== str else label_data.loc[data]['Birthyear']
        dataFrame.loc[data]['Occupation'] = label_data.loc[data]['Occupation'].split()[0]
        dataFrame.loc[data]['Gender'] = label_data.loc[data]['Gender'].split()[0]
dataFrame['Birthyear'] = dataFrame['Birthyear'].astype(int)
dataFrame['Birthyear'] = pandas.cut(dataFrame['Birthyear'], bins=[1900, 1954, 1969, 1984, 1994, 2006],
                                    labels=['65+', '50-64', '35-49', '25-34', '12-24'])
print('<!--------------------Labels opened successfully--------------------!>\n')
print(dataFrame)

<!--------------------Labels opened successfully--------------------!>

                                                            Tweets  Gender  \
ID                                                                           
najamsethi       It’s @ImranKhanPTI’s birthday. He turned 67 to...    male   
nighatdad        Absolutely! </tweet>For A, it was purely an ec...  female   
peaceforchange   Stay blessed </tweet>Stay blessed </tweet>Stay...    male   
realshoaibmalik  Happy birthday Ahmad 🎂🎂🎂 </tweet>Congrats! </t...    male   
sanabucha        مسئلہ یہ نہیں کہ مولانا سیاست میں مذہب کا استع...  female   
sanammodysaeed   Why are films like #Durj Banned. If Hollywood ...  female   
shoaib100mph     It is wonderful to have cricket back in Pakist...    male   
sohaibcricketer  Great batting aamir best batsman among the bow...    male   
vennelakishore   🙏🙏🙏 </tweet>Kumbalangi 🤩 </tweet>SHAMI..AGAINN...    male   
waqyounis99      “Long Live Skipper” Today is a special day for...    

# Feature Extraction

In [5]:
features = pandas.DataFrame(index=dataFrame.index)

In [6]:
# features['likes']
# features['media']
# features['followers']
# features['following']
# features['joining date']
# features['total tweets']
# features['private']

features['happy_emoji'] = dataFrame['Tweets'].apply(lambda x:len(re.findall('😀|😃|😄|😁|😆|🤣|😂',x)))
features['sad_emoji'] = dataFrame['Tweets'].apply(lambda x:len(re.findall('😕|😟|🙁|☹|😯|🥺|😦|😧|😨|😰|😥|😢|😭|😖|😣|😞',x)))
features['emoji_count'] = dataFrame['Tweets'].apply(lambda x:len([emoji for emoji in x if emoji in UNICODE_EMOJI]))
features['URL_count'] = dataFrame['Tweets'].apply(lambda x: len(re.findall(r"(http|https)\S*",x)))
features['@_count'] = dataFrame['Tweets'].apply(lambda x: len(re.findall('@',x)))
features['#_count'] = dataFrame['Tweets'].apply(lambda x: len(re.findall('#',x)))
features['?_count'] = dataFrame['Tweets'].apply(lambda x: len(re.findall('\?',x)))
features['"_count'] = dataFrame['Tweets'].apply(lambda x: len(re.findall('"',x)))
features["'_count"] = dataFrame['Tweets'].apply(lambda x: len(re.findall("'",x)))
features[' _count'] = dataFrame['Tweets'].apply(lambda x: len(re.findall(' ',x)))
features['!_conut'] = dataFrame['Tweets'].apply(lambda x: len(re.findall('!',x)))
features['._count'] = dataFrame['Tweets'].apply(lambda x: len(re.findall('.',x)))
features[',_count'] = dataFrame['Tweets'].apply(lambda x: len(re.findall(',',x)))
features['special_chars'] = dataFrame.Tweets.apply(lambda x: len(re.findall('[!|@|#|$|&|*|_| ]',x)))
features['total_tweets'] = dataFrame.Tweets.apply(lambda x:len(re.findall(' </tweet>',x)))

dataFrame.Tweets = dataFrame.Tweets.str.replace(' </tweet>','')

In [7]:
features['AVG Words'] = round(features[' _count']/features['total_tweets'])
dataFrame.Tweets = dataFrame.Tweets.str.replace(' ','')

In [17]:
features['total_Chars'] = dataFrame.Tweets.apply(lambda x:len(re.findall(r'[\w\s]',x)))
features['AVG Chars'] = round(features['total_Chars']/features['total_tweets'])
features.drop(['total_Chars','total_tweets'],axis=1,inplace=True)

In [18]:
features

Unnamed: 0_level_0,happy_emoji,sad_emoji,emoji_count,URL_count,@_count,#_count,?_count,"""_count",'_count,_count,!_conut,._count,",_count",special_chars,AVG Words,AVG Chars
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
najamsethi,17,7,210,1359,5604,503,1142,607,972,86641,2896,633615,1746,97177,15.0,82.0
nighatdad,225,104,2068,9550,35316,24240,6661,3062,5427,527328,4825,3953120,10038,599366,14.0,79.0
peaceforchange,0,0,22,119,169,144,36,6,7,15484,85,112576,278,16068,20.0,107.0
realshoaibmalik,148,10,2337,446,3648,1683,554,31,837,62364,1480,446709,1832,70512,11.0,54.0
sanabucha,32,9,339,937,1517,778,807,598,839,48592,1874,357075,1131,53243,18.0,95.0
sanammodysaeed,1,0,298,705,463,2323,165,16,174,15883,703,144328,271,19852,13.0,87.0
shoaib100mph,16,4,88,153,525,1669,267,56,408,37169,491,246325,504,40393,18.0,89.0
sohaibcricketer,24,37,135,1,143,58,31,0,7,3623,2,25347,5,3854,9.0,42.0
vennelakishore,10814,137,31822,680,855,1169,428,264,206,69737,45,643327,372,72110,4.0,24.0
waqyounis99,6,0,194,57,478,701,67,42,115,12390,215,90617,305,13942,18.0,100.0
