In [109]:
import pandas as pd
import numpy as np
import os

In [110]:
main_folder='C:/Users/shiri/OneDrive/Desktop/diagnostics'
dictionary_folder='dictionary'

dictionary_path=os.path.join(main_folder,dictionary_folder)
print(dictionary_path)

C:/Users/shiri/OneDrive/Desktop/diagnostics\dictionary


In [112]:
def check_duplicates(filename='pop.csv',main_folder='C:/Users/shiri/OneDrive/Desktop/diagnostics'):
    '''it will check for duplicate rows and single out all 
    the indices in the csv file which are duplicated'''
    
    #read in the dataframe
    df=pd.read_csv(os.path.join(main_folder,filename))
    df.columns.str.lower()
    
    #get the dataframe where we have duplicates, keep=False will keep both duplicated rows
    df_dup=df[df.duplicated(keep=False)]
    #groupby all the columns to get groups of duplicated rows
    grouped_df=df_dup.groupby(list(df.columns))

    #getting tuples of indices for duplicated rows. adding 2 to the index to match the index in csv file
    return [tuple(v.index+2) for k,v in grouped_df]
        

In [113]:
check_duplicates()

[]

In [114]:
def check_constraints(filename='pop.csv',dictionary_folder='dictionary',
                      main_folder='C:/Users/shiri/OneDrive/Desktop/diagnostics'):
    
    '''it will go through all the categorical columns and check for constraints violation
    by comparing each column to a dictionary file in a specified folder
    if there is a violation it will print out the indices of the csv datafile, the column involved 
    and the values which are not found in the dictionary for that specific column'''
    
    #separating date columns from the rest of the columns, named here as categ_cols
    df=pd.read_csv(os.path.join(main_folder,filename))
    df.columns=df.columns.str.lower()

    date_cols=[]
    categ_cols=[]

    for c in df.columns:
        try:
            date_col=str(int(c))
            date_cols.append(date_col)
        except ValueError:
            categ_cols.append(c)
            
    for f in categ_cols:
        try:
            df_dictionary=pd.read_excel(os.path.join(main_folder,dictionary_folder,f+'_dictionary.xlsx'))
            print(f'checking constraints for column: {f}')
            
            #check the constraints
            allowed_vals=df_dictionary.iloc[:,0].unique()
            #get unique values for column f for the main datafile
            unique_vals=df[f].unique()
            #get the difference in sets to exclude values in the main dataset not found in the dictionary
            diff=list(set(df[f].unique()).difference(set(df_dictionary.iloc[:,0].unique())))
            #get the index of the main csv file where there is an outlier value
            if len(diff)>0:
                idx=list(df[df[f].isin(diff)].index+2)
                print(f'WARNING !!! outliers in column: {f} and indices {idx} in the csv file')
                print(f'WARNING !!! the outliers are: {diff}\n')
            else:
                print('no violation for constraints\n')
                
        except:
            print(f'could not  find {f}')

In [None]:
check_constraints()

In [118]:
def iqr(filename='pop.csv',main_folder='C:/Users/shiri/OneDrive/Desktop/diagnostics'):
    
    '''this function will 
    1-extract the dataframe with dates as columns
    2-loop over rows and separate rows with less than 5 datapoints from the rest
    3-apply iqr on the rows with >=5 datapoints and incase of anomaly detection, save the row index alongside the column
    with the anomaly'''
    
    #separating date columns from the rest of the columns, named here as categ_cols
    df=pd.read_csv(os.path.join(main_folder,filename))
    df.columns=df.columns.str.lower()

    date_cols=[]
    categ_cols=[]

    for c in df.columns:
        try:
            #have to convert back to strings because in the original dataframe they are in strings
            date_col=str(int(c))
            date_cols.append(date_col)
        except ValueError:
            categ_cols.append(c)
            
    df_values=df[date_cols]
    
    #loop over rows to separate rows with <5 datapoints from the rest
    idx_gte5=[] #indices list for greater or equal to 5 datapoints
    idx_lt5=[]  #indices list for less or equal to 5 datapoints
    outlier_idx=[]  #outliers index
    outlier_col_idx=[]
    
    for idx,row in df_values.iterrows():
        #use item() to extract the integer from numpy.int64
        if row.count().item()>=5:
            idx_gte5.append(idx)
            #apply IQR on the row
            Q1=row.quantile(0.25)
            Q3=row.quantile(0.75)
            IQR=Q3-Q1

            lower_bound = Q1 - 5 * IQR
            upper_bound = Q3 + 5 * IQR
            
            #if there was at least one datapoint outside the range print the index of the row and 
            #the column index of the outlier
            bool=(row<lower_bound) | (row>upper_bound)
            if any(bool):
                outlier_idx.append(idx)
                
                #get the column indices for the outliers
                col_idx=list(row[bool].index)
                outlier_col_idx.append(col_idx)
            
        else:
            idx_lt5.append(idx)
    
    #write the dataframe with outliers to an excel file
    df_outlier=df.iloc[outlier_idx]
    #add the column indices as a column to the dataframe
    df_outlier['column_indices']=outlier_col_idx
    df_outlier.to_excel(main_folder+'/outliers.xlsx', index=False, sheet_name='Sheet1')
    
    #write the dataframe with less than 5 datapoints to an excel file
    df_lt5=df.iloc[idx_lt5]
    df_lt5.to_excel(main_folder+'/df_lt5.xlsx', index=False, sheet_name='Sheet1')
    #print out the indicator names
    unique_inds=df_lt5['indicator_name'].unique()
    print('the indicators below have less than 5 datapoints:\n')
    print(unique_inds)
    print('------------------------------------------------')
    
    df_gte5=df.iloc[idx_gte5]
    df_values_gte5=df_values.iloc[idx_gte5]
    
    #apply IQR for each row
    
    

In [119]:
iqr()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_outlier['column_indices']=outlier_col_idx


the indicators below have less than 5 datapoints:

['Average annual population growth rate (percent)'
 'Average household size (number)' 'Average number of persons per room'
 'Households headed by women (percent)'
 'Mean age at first marriage (years)' 'Registered marriages (number)'
 'Registered divorces (number)' 'Registered deaths (number)'
 'Total fertility rate (children per woman)' 'Refugee population (number)'
 'International migrant stock (number)'
 'Population distribution by marital status (percentage)'
 'Population size (number)'
 'Population distribution by marital status (number)'
 'Registered livebirths (number)' 'Infant mortality rate (per 1'
 'Under-five mortality rate (per 1' 'Causes of death (percent)']
------------------------------------------------
