In [None]:
#Importing modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from io import open
#Let pandas display up to 150 columns before truncating wide frames
pd.set_option("display.max_columns", 150)


In [None]:
#Load the dataset

#Note: the file separates its values with TABs, hence sep='\t'.
data = pd.read_csv('data-final.csv', sep='\t')

#Drop the columns we will not use (response times, country, etc.):
#every column at positions 50 through 109 is removed.
unused_columns = data.columns[50:110]
data = data.drop(columns=unused_columns)


In [None]:
#Analysing the statistics of the dataset

#Format displayed floats with two decimals
pd.options.display.float_format = "{:.2f}".format

#Show the summary statistics.
#A notebook cell only renders its LAST expression, so this intermediate
#result has to be displayed explicitly or it is silently discarded.
display(data.describe())

#Obs.: some answers have the value 0, although the minimum valid answer is 1.
#This divergence must be repaired before clustering.

#Distribution of the answers to the first question (shows the 0 entries)
data["EXT1"].value_counts()

In [None]:
#Select the rows in which EVERY column equals zero and summarise them

all_zero_rows = data.eq(0.00).all(axis=1)
data.loc[all_zero_rows].describe()

In [None]:
#Keep only the rows whose every value is strictly positive.
#Comparisons against NaN are False, so rows with missing values are removed too.
#.copy() makes the result an independent frame, so adding a column to it later
#does not trigger pandas' SettingWithCopyWarning.
data = data[(data > 0.00).all(axis=1)].copy()

#Check: the answer distribution should no longer contain 0
data["EXT1"].value_counts()

In [None]:
#Find the appropriate number of clusters

#Importing modules
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

In [None]:
#Setting up KMeans and the elbow visualizer.
#A fixed random_state makes the centroid initialisation, and therefore the
#elbow curve, reproducible from one run to the next.
kmeans = KMeans(random_state=1)

#Try every number of clusters in the range k=(2,10)
visualizer = KElbowVisualizer(kmeans, k=(2,10))

#Use a sample of at most 5000 rows to keep the search fast
#(min() avoids the ValueError that sample() raises when n exceeds the row count)
data_sample = data.sample(n=min(5000, len(data)), random_state=1)

#Fit one model per k and draw the distortion curve
#NOTE(review): newer Yellowbrick releases appear to name this method show();
#confirm against the installed version before renaming.
visualizer.fit(data_sample)
visualizer.poof()

#Result: the KMeans distortion score indicates k=5

In [None]:
#Grouping the data in 5 clusters.
#random_state pins the initialisation so the cluster numbering (and every
#"Group i" interpretation derived from it) is stable across runs.
kmeans = KMeans(n_clusters=5, random_state=1)

#Fit on the answer columns only: if this cell is re-run after the 'Clusters'
#label column has been added to `data`, that column must not become a feature.
k_fit = kmeans.fit(data.drop(columns=['Clusters'], errors='ignore'))

In [None]:
#Attach to every row the cluster it was assigned to during fitting
cluster_labels = k_fit.labels_
pred = cluster_labels
data['Clusters'] = cluster_labels

#Quick look at the first rows to confirm the new column
data.head(5)

#data['Clusters'].value_counts()

#Now every answer sheet can be identified by its cluster!

In [None]:
#How do we find out what a cluster means?
#First, group the data by cluster and take the mean of every question.
#Second, group the columns by personality trait and take the mean (total/10).

#Step one: per-cluster mean of every column
rows_by_cluster = data.groupby('Clusters')
rows_by_cluster.mean()

In [None]:
#Columns of each personality trait: ten consecutive questions per trait
col_list = list(data)
EXT = col_list[0:10]
EST = col_list[10:20]
AGR = col_list[20:30]
CSN = col_list[30:40]
OPN = col_list[40:50]

#Name of the score column -> the questions that feed it
trait_columns = {
    'EXTROVERSION': EXT,
    'NEUROTIC': EST,
    'AGREEABLE': AGR,
    'CONSCIENTIOUS': CSN,
    'OPEN': OPN,
}

#Score of each trait per respondent: sum of its ten answers divided by 10
#NOTE(review): if some questionnaire items are reverse-keyed they would need
#inverting before being averaged - confirm against the survey codebook.
data_sum = pd.DataFrame(
    {trait: data[items].sum(axis=1) / 10 for trait, items in trait_columns.items()}
)
data_sum['clusters'] = pred

#Mean trait score of every cluster: computed once, stored, and shown as the
#last expression of the cell so that the table is actually rendered.
data_clusters = data_sum.groupby('clusters').mean()
data_clusters

In [None]:
#Plotting the mean trait scores of every cluster, one panel per cluster.
#data_clusters has one ROW per cluster and one COLUMN per trait, so each panel
#must draw a row; selecting a column would plot a single trait across the
#clusters underneath trait names and a "Group" title that do not belong to it.

n_groups = len(data_clusters)
fig, axes = plt.subplots(1, n_groups, figsize=(22, 3), squeeze=False)
for ax, (cluster_id, trait_means) in zip(axes[0], data_clusters.iterrows()):
    ax.bar(trait_means.index, trait_means.values, color='green', alpha=0.2)
    ax.plot(trait_means.index, trait_means.values, color='red')
    ax.set_title('Group ' + str(cluster_id))
    ax.tick_params(axis='x', labelrotation=45)
    #Answers go from 1 to 5, so a mean score can reach 5
    ax.set_ylim(0, 5)


In [None]:
#Now we need to build an interface to interact with the user
#The interface collects the user's answers and predicts the user's cluster

import gradio as gr

In [None]:
#Reading the survey's questions, one per line.
#The with-block closes the file as soon as it has been read, and splitlines()
#copes with both "\n" and "\r\n" endings without leaving an empty last entry.
with open("questions.txt") as questions_file:
    d_questions = questions_file.read().splitlines()
d_questions

In [None]:
#Obtaining only the question text of every line
questions = []
for raw_line in d_questions:
    line = str(raw_line)
    #Skip blank lines (e.g. the empty string left after a trailing newline):
    #each entry becomes an input slider, so a bogus entry would change the
    #number of values handed to the model.
    if not line.strip():
        continue
    #The text follows the first TAB. When a line has no TAB, keep the whole
    #line: slicing from find()'s -1 would keep only its last character.
    _prefix, tab, text = line.partition("\t")
    questions.append((text if tab else line).strip())
questions

In [None]:
#Build one slider per question (1 to 5 in steps of 1, starting at 3);
#the resulting list holds the inputs that will receive the user's answers
inputs_questions = [
    gr.inputs.Slider(minimum=1, maximum=5, step=1, default=3, label=question_text)
    for question_text in questions
]

In [None]:
#Create an interface with gradio
def predict(*outputs_questions):
    """Predict the cluster for one set of answers.

    Args:
        *outputs_questions: the answers, one positional value per question.

    Returns:
        Whatever ``k_fit.predict`` returns for the single row built from the
        answers.
    """
    #The model is given a 2-D input: one row holding all the answers
    answers_row = np.asarray(outputs_questions).reshape(1, -1)
    cluster_label = k_fit.predict(answers_row)
    return cluster_label

#Options of the interface: the prediction function, the texts shown to the
#user, one input per question and a plain-text output
interface_options = dict(
    fn=predict,
    title="Big Five Personality",
    description="Personality Traits Detection System. The scale was labeled 1=Disagree, 3=Neutral, 5=Agree.",
    inputs=inputs_questions,
    outputs="text",
)
iface = gr.Interface(**interface_options)

#NOTE(review): share=True presumably exposes the app through a public link -
#confirm that this is intended before running it.
iface.launch(share=True)