In [None]:
### Helper functions

In [None]:
# Common filler words that carry no signal. Use this set only for the
# word-cloud generation.
stop_words = set()
stop_words.update(STOPWORDS)
# print(list(stop_words)[:5])

In [None]:
def load_data(path, datetime_colname=''):
    """Load a CSV or JSON file using pandas. Don't use for large datasets.

    Input-> path (required): file path containing '.csv' or '.json'.
            A '.json' file is first read as regular JSON and, if that fails
            to parse, re-read as JSON Lines (one record per line).
            datetime_colname (optional): if passed, this col will be
            converted to datetime based on inference and the rows are sorted
            by it in descending order.
    Output-> a pandas df, without the rows whose columns are all null.
    Raises-> ValueError if the path is neither a '.csv' nor a '.json' file.
    """
    if '.csv' in path:
        df = pd.read_csv(path, header=0)
    elif '.json' in path:
        try:
            df = pd.read_json(path)
        except ValueError:
            # pandas raises ValueError when the file is not one JSON document;
            # the usual cause is a JSON Lines file, so retry in that mode.
            df = pd.read_json(path, lines=True)
    else:
        raise ValueError(
            f"Unsupported file type for {path!r}: expected a '.csv' or '.json' path"
        )

    df = df.dropna(how='all')  # drop records where all columns have null value
    print("Columns :", df.columns, "\n")
    print("Rows    :", df.shape[0], "\n")

    if datetime_colname != '':
        df[datetime_colname] = pd.to_datetime(df[datetime_colname])
        df = df.sort_values(datetime_colname, ascending=False)

    return df

In [None]:
# filter data (optional) 
def filter_data(df, colname, limiting_val, condition_val):
    """Keep only the rows whose `colname` value is above / below a limit.

       Input: pandas df, col to filter on, limiting_val to filter data,
              condition_val one of ('greater than', 'less_than'). The space
              and underscore spellings are interchangeable and case is
              ignored, so 'greater_than' and 'less than' work as well.
       Output: filtered pandas df. An unrecognised condition_val prints a
               message and returns df unfiltered. If the comparison fails
               (e.g. missing column, incomparable types) the error args are
               printed and None is returned. """

    # Normalise the spelling so both documented styles select the same branch.
    condition_key = str(condition_val).strip().lower().replace('_', ' ')

    try:
        if condition_key == 'greater than':
            filtered = df[df[colname] > limiting_val]
        elif condition_key == 'less than':
            filtered = df[df[colname] < limiting_val]
        else:
            print(f"Unknown condition_val {condition_val!r}; returning data unfiltered")
            return df
    except (KeyError, TypeError, ValueError) as e:
        print(e.args)
        return None

    print(f"Filtered data shape : {filtered.shape}")
    return filtered

In [None]:
def gen_wordclouds(df, textcolumnname, nafillvalue=''):
    """ Function to generate word clouds based on a pandas text column
        Input: a pandas df, a textual column, fill value (optional) if nulls in column.
               With the default nafillvalue='' rows with a null text are left out;
               any other value is substituted for the nulls instead.
        Output: None. A wordcloud based on the text column (minus stopwords) is
                drawn on a new matplotlib figure. The input df is not modified. """

    text_series = df[textcolumnname]
    null_count = text_series.isna().sum()

    if null_count != 0:
        print("Null count", null_count)
        print("Text column cannot have nulls. Dtype error. Fix float -> str")
        print("Either fill na or don't plot NA data (defaults to not plot NAs)")

        if nafillvalue == '':
            text_series = text_series.dropna()
        else:
            text_series = text_series.fillna(nafillvalue)

    # Join with a space so the last word of one row and the first word of the
    # next row are not fused into a single token.
    text_data_to_generate = ' '.join(text_series.astype(str))
    wordcloud = WordCloud(stopwords = stop_words, width=1600 , height=800,background_color="White",colormap="Set2").generate(text_data_to_generate)
    plt.ion()
    plt.figure(figsize=(20,10),facecolor='k')
    plt.imshow(wordcloud,interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout(pad=0)
    return None

In [None]:
def plot_dist_series(df, colname):
    """ Plot a histogram of one numeric column.
        Inputs -> pandas df, numeric col to plot. If a scalar `name` attribute
                  has been attached to the df it is used as the plot title;
                  otherwise the plot has no title.
        Output -> whatever the pandas hist plot call returns, or None if
                  plotting failed (the error args are printed). """
    # `name` is not a standard DataFrame attribute, so it may be missing. If
    # the frame has a column called "name", attribute access returns that
    # column instead of a label, hence the scalar check.
    label = getattr(df, 'name', None)
    title = str(label) if label is not None and pd.api.types.is_scalar(label) else None

    try:
        # Selecting the column explicitly guarantees that only `colname` is
        # plotted, independent of how the installed pandas version treats a
        # `column=` keyword passed to plot.hist.
        return df[colname].plot.hist(title=title) # , by ='Product'

    except Exception as e:
        print(e.args)
        return None

In [None]:
def manual_groupby(df, columnname):
    """ Find all unique values in a column and create datasets for each grouping
        Input: pandas df, columnname for which values need to be grouped
        Output: a list of pandas dataframes, a pandas series with df sizes at same indexes
                (use pd.concat(list_of_dfs) to union). Each df in list is named by the
                group value via a `name` attribute. Groups appear in order of first
                occurrence; rows with a missing group value form their own group. """

    group_values = df[columnname]
    list_of_dfs = []
    sizes = []
    for grp in group_values.unique():
        # NaN/None never compare equal to themselves, so match them explicitly
        if pd.isna(grp):
            mask = group_values.isna()
        else:
            mask = group_values == grp

        grouped_df = df.loc[mask]
        # NOTE(review): plain attribute assignment; if df has a column literally
        # called "name", pandas would write to that column instead - confirm no
        # caller passes such a frame.
        grouped_df.name = grp

        list_of_dfs.append(grouped_df)
        sizes.append(grouped_df.shape[0])

    # explicit dtype keeps an empty result an integer series
    return list_of_dfs, pd.Series(sizes, dtype='int64')

In [None]:
def plot_grouped_data(list_of_dfs, sizes, textcolumn, numcolumn, to_limit=False, n_limit=20):
    """Takes as input a list of dataframes and sizes of dataframes and prints dist of numeric column and wordcloud for textcolumn. Doesn't return anything
       Input: list of pandas dfs (each carrying a `name` attribute), pandas series with sizes of the dfs (same order),
              textcolumn name, numcolumn name, to_limit whether to limit the plotting to first n_limit dfs (desc), n_limit
       The dfs are processed from the largest to the smallest size. """

    # Drop any custom index so the sorted index values are list positions.
    positions_by_size = sizes.reset_index(drop=True).sort_values(ascending=False).index.values

    if to_limit:
        # ignore this if list_of_dfs size is not very large.
        positions_by_size = positions_by_size[:n_limit]

    # Index the list directly: wrapping DataFrames in np.array would try to
    # turn them into a numeric array instead of keeping the frame objects.
    for position in positions_by_size:
        grouped_df = list_of_dfs[position]
        print(grouped_df.name)
        plot_dist_series(grouped_df, numcolumn)
        gen_wordclouds(grouped_df, textcolumn)

In [None]:
def get_sentiment(df, textcolname):
    """ Function to get polarity [-1,1] and subjectivity[0,1] from a text using TextBlob
        Input: pandas df, text col name to get sentiment
        Output: the same pandas df (modified in place) with 2 sentiment columns-
                sentiment_polarity, sentiment_subjectivity. Rows whose text is null
                are not scored and keep NaN (or their previous value) in both columns. """

    # Positional boolean mask of the rows that actually have text to score.
    has_text = df[textcolname].notna().to_numpy()

    # Build each TextBlob once and reuse its sentiment for both output columns.
    sentiments = [TextBlob(str(text)).sentiment for text in df.loc[has_text, textcolname]]

    for column, attribute in (('sentiment_polarity', 'polarity'),
                              ('sentiment_subjectivity', 'subjectivity')):
        if column not in df.columns:
            df[column] = float('nan')
        if sentiments:
            df.loc[has_text, column] = [getattr(sentiment, attribute) for sentiment in sentiments]

    return df