In [2]:
import glob,os
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [7]:
dirPath = 'C:/Users/rohan.gawankar/Documents/Python/trial-run-recruitment-task/data'
colNames = ['message','sentiment']
pattern = ".txt"

In [10]:
def findFilesInDir(dirPath, pattern):
    """Finds the list of filenames in `dirPath` that match `pattern`.
    
    dirPath : Path to the directory to find files in.
    pattern : A glob pattern to match filenames against.
    Returns : List of filenames in `dirPath` that match `pattern`.
    """
    os.chdir(dirPath)
    for file in glob.glob(pattern):
        print(file)

In [17]:
findFilesInDir("C:/Users/rohan.gawankar/Documents/Python/trial-run-recruitment-task/data","*.txt")

amazon_cells_labelled.txt
imdb_labelled.txt
readme.txt
yelp_labelled.txt


In [12]:
 def readMessagesFromFile(fPath, colNames, **kwargs):
    """Reads a file with messages and corresponding sentiments and
    returns a pandas DataFrame.

    fPath    : Path to the file to read messages from.
    colNames : Column names for the returned DataFrame.
    Returns  : Pandas DataFrame containing messages and sentiments
               as columns.

    Sample Output:
                                                 message  sentiment
    0                           Wow... Loved this place.          1
    1  I learned that if an electric slicer is used t...        NaN
    2                   But they don't clean the chiles?        NaN
    3                                 Crust is not good.          0
    4          Not tasty and the texture was just nasty.          0

    """
    print pd.read_csv(fPath,names=colNames,sep="\t").head(5)

In [14]:
readMessagesFromFile("C:/Users/rohan.gawankar/Documents/Python/trial-run-recruitment-task/data/imdb_labelled.txt",colNames)

                                             message  sentiment
0  A very, very, very slow-moving, aimless movie ...          0
1  Not sure who was more lost - the flat characte...          0
2  Attempting artiness with black & white and cle...          0
3       Very little music or anything to speak of.            0
4  The best scene in the movie was when Gerardo i...          1


In [18]:
"""For files in directory that match a pattern, return a dictionary of
    pandas DataFrames labeled by the filepaths.

    dirPath         : Path to the directory to find files in.
    fileNamePattern : A glob pattern to match filenames against.
    dfColNames      : Column names for the returned DataFrame.
    Returns         : Dictionary of DataFrames labeled by the filepaths.

    """

def readMessagesFromDir(dirPath, fileNamePattern, dfColNames, **kwargs):
    
    filelist = os.listdir(dirPath)
    list = []
    for file in filelist:
        if file.endswith(fileNamePattern):
            df = pd.read_csv(dirPath+"/"+file,sep="\t",names=dfColNames)
            df['label'] = file #os.path.join(dirPath,file)
            list.append(df)
    frame = pd.concat(list).reset_index(drop=True)
   
    df_dict = frame.to_dict()
    #df_dict = {k: v[['message','sentiment']] for k,v in frame.groupby('filepath')}
    return df_dict

In [22]:
msgDFDict = readMessagesFromDir(dirPath,pattern,colNames)

In [23]:
"""Make label for file from file path.

    fromFilePath : Input file path.
    Returns      : String label.

    Sample output:

    '../abc/xyz_labelled.txt' -> 'xyz'
    """

def makeLabel(fromFilePath):
    label = os.path.basename(fromFilePath)
    return label[:label.rfind('_')]

In [24]:
print makeLabel("C:/Users/Rohan/Downloads/trial-run-recruitment-task/trial-run-recruitment-task/data/amazon_cells_labelled.txt")

amazon_cells


In [25]:
"""Concatenate DataFrames by rows adding a column that indicates the
    source of the message. Make sure that the index for each row is unique.

    Sample output:

                                             messages  sentiment         label
    The CG opening sequence in space looked like i...          0          imdb
    Then one day, I went to use them and the recie...        NaN  amazon_cells
    And the pho is much better when it is served f...        NaN          yelp
    I have always had cases for my cell phones bec...        NaN  amazon_cells
                    I'll let you know how it goes....        NaN          yelp
                                  It looks very nice.          1  amazon_cells
        The Veggitarian platter is out of this world!          1          yelp
                  It looked like a wonderful story.            1          imdb
                       Too much hassle for my liking.        NaN  amazon_cells
    So far so good with this one, plus the glowing...        NaN  amazon_cells
    """

def concatDataFrames(msgDFDict,labelFunc):
    df = DataFrame(msgDFDict)
    df['label'] = df['label'].apply(labelFunc)
    return df

In [26]:

msgDF = concatDataFrames(msgDFDict,makeLabel)
msgDF = msgDF[np.isfinite(msgDF['sentiment'])].reset_index(drop=True)

msgDF.head()


Unnamed: 0,label,message,sentiment
0,amazon_cells,So there is no way for me to plug it in here i...,0
1,amazon_cells,"Good case, Excellent value.",1
2,amazon_cells,Great for the jawbone.,1
3,amazon_cells,Tied to charger for conversations lasting more...,0
4,amazon_cells,The mic is great.,1


In [27]:
def makeTermsFrom(msg):
    """Use this function to convert a message into vocabulary terms to
    avoid confusion regarding what is a valid term.
    """
    
    return [m for m in msg.lower().split() if m]

In [28]:
msgDF['message'] = msgDF['message'].apply(makeTermsFrom)

In [29]:
msgDF.head()

Unnamed: 0,label,message,sentiment
0,amazon_cells,"[so, there, is, no, way, for, me, to, plug, it...",0
1,amazon_cells,"[good, case,, excellent, value.]",1
2,amazon_cells,"[great, for, the, jawbone.]",1
3,amazon_cells,"[tied, to, charger, for, conversations, lastin...",0
4,amazon_cells,"[the, mic, is, great.]",1


In [30]:
def countVocabulary(msgsDF):
    """Take a DF of messages, sentiments, and labels and return a DF with
    terms, sentiments, labels, and the corresponding counts. Write
    whatever helper functions are required to achieve this task.

    Sample output:
              term  sentiment         label  count
    0            !          0  amazon_cells      1
    1            !          0          yelp      1
    2            !          1  amazon_cells      1
    3            !          1          imdb      1
    4            !          1          yelp      1
    5           !!          1  amazon_cells      1
    6          !!!          1          yelp      1
    7     !....the          0          yelp      1
    8          !2.          1  amazon_cells      1
    9           !i          1  amazon_cells      1
    10       "1.2"          1  amazon_cells      1
    11        "10"          1          imdb      1
    12          "a          1          imdb      1
    13     "about"          1          imdb      1
    14     "acting          1          imdb      1
    15        "are          0          yelp      1
    16        "art          1          imdb      1
    17         "at          1          imdb      1
    18        "big          1          imdb      1
    19  "breeders"          0          imdb      1
    20      "clip"          0  amazon_cells      1
    21   "collect"          0          imdb      1
    22    "crumby"          0          yelp      1
    23        "don          1          imdb      1
    24        "eel          0          yelp      1
    """
    df = DataFrame()
    for msg, stmt,lbl in zip(msgsDF.message,msgsDF.sentiment,msgsDF.label):
        for m in msg:
            df = df.append(DataFrame({'term':[m],'sentiment':[stmt],'label':[lbl]}))
    for name, group in df.groupby(['term','sentiment','label']):
        df['count'] = group.term.count()
    df = df.reset_index(drop=True)
    return df


In [35]:
counts = countVocabulary(msgDF.head())

In [36]:
counts

Unnamed: 0,label,sentiment,term,count
0,amazon_cells,0,so,1
1,amazon_cells,0,there,1
2,amazon_cells,0,is,1
3,amazon_cells,0,no,1
4,amazon_cells,0,way,1
5,amazon_cells,0,for,1
6,amazon_cells,0,me,1
7,amazon_cells,0,to,1
8,amazon_cells,0,plug,1
9,amazon_cells,0,it,1
