In [None]:
import pandas as pd 

# Both sheets live in the same workbook. Opening it once through a context
# manager parses the file a single time and guarantees the handle is closed
# afterwards (passing bare open(...) results left two file objects unclosed).
with pd.ExcelFile('theses_dissertations_with_pdfs.xlsx') as workbook:
    # Sheet 'escolhidos-real'
    df_real = pd.read_excel(workbook, sheet_name='escolhidos-real')
    # Sheet 'escolhidos-extendido-bloq'
    df_extras = pd.read_excel(workbook, sheet_name='escolhidos-extendido-bloq')

In [None]:
# Display the frame loaded from the 'escolhidos-extendido-bloq' sheet.
df_extras

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# Works per year in the 'real' selection.
year_counts1 = (
    df_real['Year']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='Real')
)

# Works per year in the extended selection whose PDF download failed.
year_counts2 = (
    df_extras.loc[df_extras['PDF_Name'] == 'Download Failed', 'Year']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='Extra - sem acesso')
)

# Outer join on 'Year': a year missing from one side gets a count of 0.
merged_counts = year_counts2.merge(year_counts1, on='Year', how='outer').fillna(0)

# Long format (one row per year/category pair) for the stacked bar chart.
melted_counts = merged_counts.melt(
    id_vars='Year',
    value_vars=['Real', 'Extra - sem acesso'],
    var_name='Categoria',
    value_name='Qt de trabalhos',
)

melted_counts


In [None]:

# Stacked bar chart (Plotly Express): one bar per year, split by category.
fig = px.bar(
    melted_counts,
    x="Year",
    y="Qt de trabalhos",
    color="Categoria",
    color_discrete_sequence=px.colors.qualitative.Set2,
    text_auto=True,
)

fig.update_layout(
    title='Teses e Dissertações em comp. quântica por ano',
    xaxis_title='Ano',
    # Figure size in pixels.
    width=800,
    height=400,
    # One tick per year.
    # NOTE(review): '%Y' is a date-style format; if 'Year' is stored as a plain
    # number it is probably not a valid tick format for this axis - confirm.
    xaxis={
        'tickmode': 'linear',
        'dtick': 1,
        'tickformat': '%Y',
    },
)
fig.show()

In [None]:
import geobr
# Load geobr's municipal-seat table; later cells join on its 'name_muni' column
# and read its 'geometry' column.
cidades_br = geobr.read_municipal_seat()
cidades_br

In [None]:
# Distinct city names, sorted, under the column name used by the geobr table.
# Missing cities are dropped first: a NaN among the strings would make sorted()
# raise TypeError (float vs str comparison).
dados = {'name_muni': sorted(df_extras['City'].dropna().unique())}

In [None]:
# Left join keeps every city name from the data, matched or not, and attaches
# the municipal-seat rows that share the same 'name_muni'.
df = pd.DataFrame(dados).merge(cidades_br, how='left', on='name_muni')
df

In [None]:
# Fix Santa Maria, Santo André and São Carlos by discarding rows 15, 17 and 20.
# NOTE(review): presumably these are extra matches produced by the name-only
# join; the row labels are hard-coded, so re-check them if the data changes.
df = df.drop(index=[15, 17, 20])

In [None]:
# Store the geometry as text so it can later be parsed with string operations.
df = df.assign(geometry=df['geometry'].astype('str'))

In [None]:
import pandas as pd
import plotly.express as px

# Rename the lookup column to 'City' so it shares a key with df_extras.
df = df.rename(columns={"name_muni": "City"})
df_geometry = df
df_data_points = df_extras

# Inner join of the works with the city geometry rows on 'City'.
df_merged = df_data_points.merge(df_geometry, on='City', how='inner')

# Number of works per city as a two-column frame ('City', 'count').
df_counts = (
    df_merged['City']
    .value_counts()
    .rename_axis('City')
    .reset_index(name='count')
)

# Put the geometry columns next to the per-city counts.
df_final = df_counts.merge(df_geometry, on='City', how='inner')

# Extract latitude and longitude from the geometry column
def extract_lat_lon(geometry):
    """Parse a WKT point such as ``'POINT (-46.63 -23.55)'`` into ``(lat, lon)``.

    WKT stores the coordinates as ``x y`` (longitude first), so the pair is
    swapped on return.

    Args:
        geometry: The point as text. Any other object is converted with
            ``str()`` first.

    Returns:
        A ``(lat, lon)`` tuple of floats. When the text contains no 2-D point
        (for example the ``'nan'`` left behind for a city with no geometry),
        ``(nan, nan)`` is returned instead of raising.

    Raises:
        ValueError: If the matched coordinates are not valid numbers.
    """
    import math
    import re

    # Tolerates "POINT(" without the space and stray whitespace inside the parentheses.
    match = re.search(r'POINT\s*\(\s*([^\s()]+)\s+([^\s()]+)\s*\)', str(geometry))
    if match is None:
        return math.nan, math.nan
    lon, lat = map(float, match.groups())
    return lat, lon

# Split each geometry string into numeric 'lat' and 'lon' columns.
df_final[['lat', 'lon']] = df_final['geometry'].apply(
    lambda wkt_point: pd.Series(extract_lat_lon(wkt_point))
)

# Bucket the per-city counts for colouring. pd.cut intervals are right-closed,
# so the edges below give (0, 1], (1, 3], ... and everything above 9.
labels = ['1', '2-3', '4-5', '6-7', '8-9', '10+']
bins = [0, 1, 3, 5, 7, 9, float('inf')]
df_final['count_bin'] = pd.cut(df_final['count'], bins=bins, labels=labels)

# Bubble map: marker size follows the raw count, colour follows the bucket.
fig = px.scatter_mapbox(
    df_final,
    lat='lat',
    lon='lon',
    hover_name='City',
    size='count',
    size_max=30,
    color='count_bin',
    color_discrete_sequence=px.colors.sequential.Greens_r,
    zoom=3,
    width=800,
    height=500,
)

# Base map tiles and legend heading.
fig.update_layout(legend_title="Contagem", mapbox_style='open-street-map')

# Render the map.
fig.show()

In [None]:
# Rank institutions by number of works in df_merged.
mais_univ = df_merged['Institution'].value_counts().reset_index()

# Horizontal bars for the first five rows of the ranking.
fig = px.bar(
    mais_univ.iloc[:5],
    y='Institution',
    x='count',
    color_discrete_sequence=px.colors.qualitative.Set2,
    text_auto=True,
)

# Figure size in pixels plus axis titles; as the last expression of the cell,
# the returned figure is what gets displayed.
fig.update_layout(
    yaxis_title='Instituição',
    xaxis_title='Contagem de trabalhos',
    height=300,
    width=800,
)

In [None]:
# Display the full per-institution count table.
mais_univ

In [None]:
# Number of distinct institutions per city, as columns 'City' and 'Count'.
unique_city_counts = (
    df_merged.groupby('City')['Institution']
    .nunique()
    .rename('Count')
    .reset_index()
)

# Largest first; reset_index() keeps the previous row labels in an 'index' column.
unique_city_counts.sort_values('Count', ascending=False).reset_index()

In [None]:
# Load geobr's state table for year=2020; a later cell filters it by
# 'abbrev_state' and uses its geometry for the choropleth.
states = geobr.read_state(year=2020)

In [None]:
# Number of rows of df_merged per state abbreviation; a later cell reads the
# 'abbrev_state' and 'count' columns of this frame.
df_cont_estados = df_merged['abbrev_state'].value_counts().reset_index()

In [None]:
import pandas as pd
import plotly.express as px
from statistics import mean
# Keep only the states that appear in the data and attach their work counts.
estados = states[states['abbrev_state'].isin(df_cont_estados['abbrev_state'])]
estados = pd.merge(estados, df_cont_estados, how='left', on='abbrev_state')
# Label the rows by state abbreviation: the index is passed as `locations`
# and has to match the feature ids of the geometry passed as `geojson`.
estados.index = list(estados['abbrev_state'])

# Sequential colour scale for the continuous 'count' colouring.
discrete_colors = px.colors.sequential.Greens

# Centre of the map: midpoint between the mean lower and mean upper edge of
# the per-state bounding boxes (bounds computed once instead of eight times).
state_bounds = estados.geometry.bounds
map_center = {
    "lat": (state_bounds['maxy'].mean() + state_bounds['miny'].mean()) / 2,
    "lon": (state_bounds['maxx'].mean() + state_bounds['minx'].mean()) / 2,
}

fig = px.choropleth_mapbox(estados,
    geojson=estados.geometry,
    locations=estados.index,
    color="count",
    center=map_center,
    # The locations are states, not municipalities.
    labels={'index': 'Estado'},
    mapbox_style="open-street-map",
    zoom=3,
    color_continuous_scale=discrete_colors,
    height=500,
    width=800,
)
# A continuous colour axis has a colorbar rather than a legend, so the title
# must be set on the colorbar (legend_title had no visible effect here).
fig.update_layout(
    margin=dict(l=1, r=1, t=1, b=1),
    coloraxis_colorbar_title_text="Contagem",
)

fig.show()

In [None]:
# Work counts per state, largest first (rows are labelled by state abbreviation).
estados[['count']].sort_values(by='count', ascending=False)

In [None]:
# Number of rows of the extended selection per 'Class' value.
df_extras['Class'].value_counts().reset_index()

In [None]:
# Per-class tallies: every work in the 'real' selection versus the works in the
# extended selection whose PDF download failed.
year_counts1 = df_real['Class'].value_counts().reset_index()
year_counts2 = df_extras[df_extras['PDF_Name'] == 'Download Failed']['Class'].value_counts().reset_index()

year_counts1.columns = ['Class', 'Real']
year_counts2.columns = ['Class', 'Extra - sem acesso']

# Merge the two tallies on 'Class'; a class missing from one side counts as 0
merged_counts = pd.merge(year_counts2, year_counts1, on='Class', how='outer').fillna(0)

# Melt the DataFrame to long format (one row per class/category pair) for Plotly Express
melted_counts = pd.melt(merged_counts, id_vars='Class', value_vars=['Real', 'Extra - sem acesso'], var_name='Categoria', value_name='Qt de trabalhos')

# Combined total per class, used below only to order the bars
melted_counts['Total'] = melted_counts.groupby('Class')['Qt de trabalhos'].transform('sum')
print(melted_counts)

# Plotting the horizontal stacked bar chart using Plotly Express
fig = px.bar(melted_counts.sort_values(by='Total'), y="Class", x="Qt de trabalhos", color="Categoria", text_auto=True, color_discrete_sequence= px.colors.qualitative.Set2)
fig.update_layout(
    # The x axis holds work counts, not years, so no date tickformat ('%Y')
    # is applied to it.
    xaxis=dict(
        tickmode='linear',
        dtick=1,  # One tick per unit, so only whole counts are labelled.
    ),
    width=800,  # Set the width of the figure (adjust as needed)
    height=300,   # Set the height of the figure (adjust as needed)
    xaxis_title='Qt. Trabalhos',
    title='Teses e Dissertações em comp. quântica por tema'
)
fig.show()

In [None]:
# Number of works for every (Year, Class) pair present in the extended selection.
count_df = (
    df_extras.groupby(['Year', 'Class'])
    .size()
    .rename('count')
    .reset_index()
)

# Grouped bar chart: one cluster per year, one bar per class.
fig = px.bar(
    count_df,
    x='Year',
    y='count',
    color='Class',
    barmode='group',
    color_discrete_sequence=px.colors.qualitative.Set2,
    text_auto=True,
)
# Figure size in pixels.
fig.update_layout(height=400, width=900)
# Show the chart.
fig.show()

In [None]:
# Flag each work by PDF availability: 'Download Failed' marks it as unavailable.
df_extras['disponibilidade'] = (
    df_extras['PDF_Name']
    .eq('Download Failed')
    .map({True: 'Não disponível', False: 'Disponível'})
)

# Count the works for every (Class, availability) pair.
count_df = (
    df_extras.groupby(['Class', 'disponibilidade'])
    .size()
    .rename('count')
    .reset_index()
)

# Grouped bar chart: one cluster per class, one bar per availability flag.
fig = px.bar(
    count_df,
    x='Class',
    y='count',
    color='disponibilidade',
    barmode='group',
    color_discrete_sequence=px.colors.qualitative.Set2,
    text_auto=True,
)
# Figure size in pixels.
fig.update_layout(height=400, width=800)
# Show the chart.
fig.show()

In [None]:
from wordcloud import WordCloud
from nltk.corpus import stopwords
import nltk
# Needed to remove single letters: some LaTeX-formatted texts have garbled accents. They could only be fixed by hand, so I chose to drop the stray letters instead.
import string

# Download the stop-word lists if they are not available yet
nltk.download('stopwords')


def _as_text(value):
    """Return one cell as plain text: sequences are space-joined, missing values become ''."""
    if isinstance(value, (list, tuple, set)):
        return ' '.join(str(item) for item in value)
    if pd.isna(value):
        return ''
    # A plain string is kept whole; joining it with ' ' would explode it into
    # single characters, which the length filter below would then discard.
    return str(value)


# Concatenate keywords, abstract and title of every work into one string.
# df_extras is only read here, so re-running the cell gives the same result.
rows_as_text = df_extras[['Palavras', 'Resumo', 'Title']].apply(
    lambda row: ' '.join(_as_text(value) for value in row),
    axis=1,
)
text = ' '.join(rows_as_text)

# Load the Portuguese stop words
stop_words = set(stopwords.words('portuguese'))

# Remove stop words and lone letters
text = ' '.join(
    word for word in text.split()
    if word.lower() not in stop_words and len(word) > 1
)

# Generate the word cloud
wordcloud = WordCloud(width=1000, height=500, background_color='white').generate(text)

# Show the word cloud
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Ideas: advisor, degree program, average number of pages, existence of funding

In [None]:
# Display the frame loaded from the 'escolhidos-real' sheet.
df_real

In [None]:
# Number of works per value of the 'Teoria e experimento' column.
count_df = df_real['Teoria e experimento'].value_counts().reset_index()

# Bar chart of those counts.
fig = px.bar(
    count_df,
    x='Teoria e experimento',
    y='count',
    barmode='group',
    color_discrete_sequence=px.colors.qualitative.Set2,
    text_auto=True,
)
# Axis titles and figure size in pixels.
fig.update_layout(
    xaxis_title='Métodos',
    yaxis_title='Qt. trabalhos',
    height=400,
    width=500,
)
# Show the chart.
fig.show()

In [None]:
# Number of works per value of the 'Lib' column.
count_df = df_real['Lib'].value_counts().reset_index()

# Bar chart of those counts; the palette is Set2 without its first and last colours.
fig = px.bar(
    count_df,
    x='Lib',
    y='count',
    barmode='group',
    color_discrete_sequence=px.colors.qualitative.Set2[1:-1],
    text_auto=True,
)
# Axis titles and figure size in pixels.
fig.update_layout(
    xaxis_title='Biblioteca para experimento',
    yaxis_title='Qt. trabalhos',
    height=400,
    width=500,
)
# Show the chart.
fig.show()

In [None]:
# Export the whole 'real' selection to 'csv_real.csv' in the working directory.
df_real.to_csv('csv_real.csv')

In [None]:
# Number of studies that do not present limitations ('Limitações' equals 'N').
df_real['Limitações'].eq('N').sum()

In [None]:
# Number of studies that give no directions for future work ('Futuro' equals 'N').
df_real['Futuro'].eq('N').sum()

In [None]:
# Export only the 'Futuro-texto' column to 'futuro.csv' in the working directory.
df_real['Futuro-texto'].to_csv('futuro.csv')

In [None]:
# Print one "Author (Year) : future-work text" line per study.
for _, study in df_real.iterrows():
    print("{} ({}) : {}".format(study['Author'], study['Year'], study['Futuro-texto']))