# NLP Hackathon

### Package import handling

In [1]:
import pandas as pd
import numpy as np
import glob
import shutil

### Preprocessing 

##### Combining all data files into one

In [2]:
combineFile = 'data.txt'

with open(combineFile,'wb') as wf:
    for filename in glob.glob('data_files/*.txt'):
        if filename == combineFile:
            continue
        with open(filename,'rb') as rf:
            shutil.copyfileobj(rf,wf)

In [3]:
with open('data.txt',encoding = 'unicode_escape') as file:
    data = file.read()

##### Regexing all user roles and stories

In [4]:
import regex as re 

In [5]:
# removing empty newlines
regex = r"(\n)\n+"
subst = "\\n"

data = re.sub(regex, subst, data, 0, re.MULTILINE)


### Solutions

##### 1 ) Automatic generation user stories for different stakeholders /roles

a. Role based Stories extraction can be done using regular expression

b. The different stakeholders involved in the corpus are data manager, PI, IT manager, ITstaff member, researcher, repository manager etc.

In [6]:
regex = r"As\s[a-z]+\s([^,]+),\s(.*)[\n]"

matches = re.finditer(regex, data, re.MULTILINE)

data_list = []

for matchNum, match in enumerate(matches, start=1):    
    #print(match.group(1),match.group(2))
    data_list.append([match.group(1).lower(),match.group(2).lower()])

data_list

[['data user', 'i want to have the 12-19-2017 deletions processed.'],
 ['ui designer',
  'i want to redesign the resources page, so that it matches the new broker design styles.'],
 ['ui designer',
  'i want to report to the agencies about user testing, so that they are aware of their contributions to making broker a better ux.'],
 ['ui designer',
  'i want to move on to round 2 of dabs or fabs landing page edits, so that i can get approvals from leadership.'],
 ['ui designer',
  'i want to move on to round 2 of homepage edits, so that i can get approvals from leadership.'],
 ['ui designer',
  'i want to move on to round 3 of the help page edits, so that i can get approvals from leadership.'],
 ['developer ',
  'i want to be able to log better, so that i can troubleshoot issues with particular submissions and functions.'],
 ['developer',
  'i want to add the updates on a fabs submission to be modified when the publishstatus changes, so that i know when the status of the submission has 

In [7]:
#converting data into dataframe
df = pd.DataFrame(columns=['user_role','user_story'],data=data_list)

df.head()

Unnamed: 0,user_role,user_story
0,data user,i want to have the 12-19-2017 deletions proces...
1,ui designer,"i want to redesign the resources page, so that..."
2,ui designer,i want to report to the agencies about user te...
3,ui designer,i want to move on to round 2 of dabs or fabs l...
4,ui designer,i want to move on to round 2 of homepage edits...


##### 2. Identify the tasks for given stakeholder


In [8]:
df['user_role'].value_counts()

user                                                   149
researcher                                             111
olderperson                                             95
archivist                                               89
developer                                               61
                                                      ... 
mike                                                     1
advertiser                                               1
site member who has read a teaser on the front page      1
technical staff member                                   1
research centre director                                 1
Name: user_role, Length: 208, dtype: int64

a. Data management 

In [9]:
df_data_management = df[df['user_story'].str.contains('data management')]

df_data_management

Unnamed: 0,user_role,user_story
714,data contributor,i want to refer to the disaster recovery plan ...
1124,funder,i want to be able to read the costs for data m...
1126,stakeholder,i want to know when the data management plan w...
1131,data manager,i want to know the time plan for collecting da...
1176,faculty data steward,"i want to see the sections on costing, so that..."
1177,faculty data steward,"i want to see the sections on roles, so that i..."
1178,faculty data steward,i want to see the sections on responsibilities...
1258,depositor,i want to allow others to deposit on my behalf...


b. Data store 

In [10]:
df_data_store = df[df['user_story'].str.contains('data store')]

df_data_store

Unnamed: 0,user_role,user_story
306,civic tech activist,i want to make it easy to assess the quality o...
928,olderperson,i want to have my data stored directly on the ...
1249,depositor,i want to link to data stored in external repo...


c. analysis 

In [11]:
df_analysis = df[df['user_story'].str.contains('analysis')]

df_analysis

Unnamed: 0,user_role,user_story
257,platform administrator,i want to be able to translate the data types ...
258,researcher,i want to get a data package into julia in sec...
260,publisher,i want to be able to provide a visualization o...
264,researcher,i want to get a data package into r in seconds...
265,researcher,i want to get a data package into excel in sec...
266,researcher,i want to get a data package into spss in seco...
267,researcher,i want to get a data package into stata in sec...
269,researcher,i want to get a data package into libreoffice/...
270,developer,i want to get a data package into python in se...
273,developer,i want to do exploratory data analysis in r an...


d. cleaning

In [12]:
df_cleaning = df[df['user_story'].str.contains('cleaning')]

df_cleaning

Unnamed: 0,user_role,user_story


e. privacy

In [13]:
df_privacy = df[df['user_story'].str.contains('privacy')]

df_privacy

Unnamed: 0,user_role,user_story
899,olderperson,i want to be able to manage my privacy and dat...
1044,trainer,i want to have a view that is attached to thei...
1172,institutional data manager,i want to know about the privacy and security ...
1181,data manager,"i want to plan the anonymization of data, so t..."


##### 3. Visualization of frequently occurred words based on stakeholders

In [2]:
import matplotlib 
from wordcloud import WordCloud

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
text = " ".join(cat.split()[1] for cat in df.category)
word_cloud = WordCloud(collocations = False, background_color = 'white').generate(text)