In [2]:
import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

In [3]:
# Load the raw training reviews from the tab-separated file; the file's first
# column becomes the index. `DataFrame.from_csv` was deprecated and later
# removed from pandas; `read_csv` with `index_col=0` is the supported
# replacement. `from_csv` also defaulted to parse_dates=True, which is
# dropped here on purpose because the displayed index holds integer ids.
df = pd.read_csv('drugLibTrain_raw.tsv', sep='\t', index_col=0)

In [4]:
# Preview only the first rows instead of rendering the whole frame,
# which bloats the saved notebook.
df.head(10)

Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
2202,enalapril,4,Highly Effective,Mild Side Effects,management of congestive heart failure,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ..."
3117,ortho-tri-cyclen,1,Highly Effective,Severe Side Effects,birth prevention,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest..."
1146,ponstel,10,Highly Effective,No Side Effects,menstrual cramps,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...
3947,prilosec,3,Marginally Effective,Mild Side Effects,acid reflux,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...
1951,lyrica,2,Marginally Effective,Severe Side Effects,fibromyalgia,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above
2372,propecia,1,Ineffective,Severe Side Effects,hair loss,"after taking propecia for over a year, startin...","Low sex drive, before i started taking this dr...",one pill once daily
1043,vyvanse,9,Highly Effective,Mild Side Effects,add,"My mood has noticably improved, I have more en...","a few experiences of nausiea, heavy moodswings...",I had began taking 20mg of Vyvanse for three m...
2715,elavil,10,Considerably Effective,No Side Effects,depression,Although this drug was originally prescribed f...,None.,"One a day, taken about an hour before bedtime."
1591,xanax,10,Highly Effective,No Side Effects,panic disorder,This simply just works fast and without any of...,I really don't have any side effects other tha...,I first started taking this at 3 times per day...
1686,claritin,1,Ineffective,Extremely Severe Side Effects,allergies,none - did nothing to help allergies. i just h...,I had some horrifying mental and physical side...,took one 10 mg pill nightly.


In [5]:
# Shape of the array of distinct sideEffects values (a 1-tuple with their count).
side_effect_levels = df['sideEffects'].unique()
side_effect_levels.shape

(5,)

In [6]:
# Shape after keeping one row per distinct (effectiveness, sideEffects) pair.
pair_columns = ['effectiveness', 'sideEffects']
df.drop_duplicates(subset=pair_columns).shape

(25, 8)

In [7]:
# Cross-check: number of groups formed by the two categorical columns.
pair_groups = df.groupby(['effectiveness', 'sideEffects'], as_index=False)
pair_groups.ngroups

25

In [8]:
# One representative row (the first occurrence) for every distinct
# (effectiveness, sideEffects) combination found in df.
df2 = df.drop_duplicates(
    subset=['effectiveness', 'sideEffects'],
    keep='first',
)

In [9]:
# Take the label pair of the first representative row as a worked example.
first_combo = df2.iloc[0]
eff, sidef = first_combo['effectiveness'], first_combo['sideEffects']

In [10]:
# Shape of the subset of df whose two labels both match the example pair.
same_combo = (df['effectiveness'] == eff) & (df['sideEffects'] == sidef)
df.loc[same_combo].shape

(482, 8)

In [11]:
# For every representative row in df2, record its (effectiveness, sideEffects)
# label pair and how many rows of df carry exactly that pair.
sizes = []
combinations = []
for eff, sidef in zip(df2['effectiveness'], df2['sideEffects']):
    matches = (df['sideEffects'] == sidef) & (df['effectiveness'] == eff)
    sizes.append(len(df.loc[matches]))
    combinations.append((eff, sidef))

In [12]:
# Row counts per (effectiveness, sideEffects) combination, in the order of df2's rows.
sizes

[482,
 86,
 529,
 37,
 55,
 76,
 255,
 75,
 197,
 206,
 361,
 85,
 81,
 122,
 119,
 67,
 35,
 27,
 46,
 30,
 20,
 30,
 43,
 26,
 17]

In [13]:
# The (effectiveness, sideEffects) label pairs, aligned position-by-position with `sizes`.
combinations

[('Highly Effective', 'Mild Side Effects'),
 ('Highly Effective', 'Severe Side Effects'),
 ('Highly Effective', 'No Side Effects'),
 ('Marginally Effective', 'Mild Side Effects'),
 ('Marginally Effective', 'Severe Side Effects'),
 ('Ineffective', 'Severe Side Effects'),
 ('Considerably Effective', 'No Side Effects'),
 ('Ineffective', 'Extremely Severe Side Effects'),
 ('Considerably Effective', 'Moderate Side Effects'),
 ('Highly Effective', 'Moderate Side Effects'),
 ('Considerably Effective', 'Mild Side Effects'),
 ('Considerably Effective', 'Severe Side Effects'),
 ('Moderately Effective', 'No Side Effects'),
 ('Moderately Effective', 'Moderate Side Effects'),
 ('Moderately Effective', 'Mild Side Effects'),
 ('Moderately Effective', 'Severe Side Effects'),
 ('Marginally Effective', 'No Side Effects'),
 ('Highly Effective', 'Extremely Severe Side Effects'),
 ('Ineffective', 'Moderate Side Effects'),
 ('Ineffective', 'No Side Effects'),
 ('Ineffective', 'Mild Side Effects'),
 ('Consid

In [14]:
# Label scales for the two categorical columns. A label's position in its
# list is used as its numeric code when the columns are encoded.
effectivess_scale = [
    'Ineffective',
    'Marginally Effective',
    'Moderately Effective',
    'Considerably Effective',
    'Highly Effective',
]

sideeffect_scale = [
    'Extremely Severe Side Effects',
    'Severe Side Effects',
    'Moderate Side Effects',
    'Mild Side Effects',
    'No Side Effects',
]

In [15]:
# Encode each row's labels as their position in the corresponding scale list.
# list.index raises ValueError if a label is not on the scale.
effs = [effectivess_scale.index(label) for label in df['effectiveness']]
sideffs = [sideeffect_scale.index(label) for label in df['sideEffects']]

# Ratings are taken over unchanged.
ratings = list(df['rating'])

In [16]:
# Stack the three encoded lists into one array:
# row 0 = effectiveness codes, row 1 = side-effect codes, row 2 = ratings.
encoded_rows = [effs, sideffs, ratings]
final_array = np.array(encoded_rows)

In [17]:
# Persist the encoded array (one line per row of final_array).
# fmt='%d' writes the integer codes as plain integers; savetxt's default
# '%.18e' would store them as 18-digit scientific-notation floats.
np.savetxt(
    'numerical_converted_data_drugslib.csv',
    final_array,
    fmt='%d',
    delimiter=",",
)

In [19]:
# First two rows of final_array: the effectiveness and side-effect codes.
final_array[:2]

array([[4, 4, 4, ..., 1, 3, 2],
       [3, 1, 4, ..., 2, 3, 2]], dtype=int64)

In [33]:
# One row per observation with columns 0 (effectiveness code) and
# 1 (side-effect code), reduced to the distinct code pairs.
code_pairs = pd.DataFrame(final_array[:2].T)
df3 = code_pairs.drop_duplicates()

In [37]:
# Distinct code pairs. df3 was already deduplicated when it was built,
# so this keeps every row and simply yields a separate frame.
basic_pairs = df3.drop_duplicates(keep='first')

In [38]:
# (number of distinct code pairs, number of code columns)
(len(basic_pairs.index), len(basic_pairs.columns))

(25, 2)

In [41]:
# Count table: one row per distinct code pair, ten columns that the tally
# loop fills at index `rating - 1`.
n_pairs = len(basic_pairs)
distributions = np.zeros((n_pairs, 10))

In [77]:
# Sum of the two codes for every distinct pair, in ascending order.
pairs_sorted = sorted(
    row[0] + row[1] for _, row in basic_pairs.iterrows()
)

In [79]:
# Sorted code sums (effectiveness code + side-effect code), one per distinct pair.
pairs_sorted

[0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 7, 7, 8]

In [None]:
# NOTE: stray scratch line disabled. On a fresh kernel it runs before
# `pairs_lookup` exists (NameError on Restart & Run All), and it would only
# append a duplicate key; the lookup list is built in full by the next cell.
# pairs_lookup.append(str(pair[0]) + str(pair[1]))

In [62]:
# String key per distinct pair: the two codes concatenated with no separator.
# The tally loop builds its lookup keys the same way, so the format must match.
pairs_lookup = [
    str(row[0]) + str(row[1]) for _, row in basic_pairs.iterrows()
]

In [72]:
# Tally, for every code pair, how often each rating occurs.
# The counts are reset first so that re-running this cell does not add a
# second pass on top of the previous one (the array is created in another cell).
distributions[:] = 0
for idx, value in enumerate(final_array[0:2, :].T):
    # Row of the count table that belongs to this observation's code pair.
    pair_idx = pairs_lookup.index(str(value[0]) + str(value[1]))
    rating = final_array[2, idx]
    # NOTE(review): assumes ratings run from 1 to 10; a rating of 0 would
    # index column -1 (the last column) -- confirm against the data.
    distributions[pair_idx, rating - 1] += 1

In [71]:
# Inspect the rating of the third observation. The original line ended in a
# dangling `=`, which is a SyntaxError.
final_array[2, 2]

10