<a href="https://colab.research.google.com/github/rmarvel23/projects/blob/main/Twitterprojectipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [106]:
import chardet
import pandas as pd

def detectar_encoding(archivo_csv):
    """Guess the character encoding of a file with chardet.

    The file is opened in binary mode, read completely into memory and the
    raw bytes are handed to ``chardet.detect``.

    Args:
        archivo_csv: Path of the file to inspect.

    Returns:
        The value stored under the ``'encoding'`` key of chardet's result.
    """
    with open(archivo_csv, 'rb') as archivo:
        contenido = archivo.read()
    return chardet.detect(contenido)['encoding']


In [None]:
# Detect the encoding of the training file; the notebook recorded 'Windows-1252' for this cell.
detectar_encoding("/content/train.csv")

'Windows-1252'

In [107]:
def read_csv(csv, encoding):
    """Load a comma-separated file into a pandas DataFrame.

    Args:
        csv: Location of the CSV data, forwarded to ``pandas.read_csv``.
        encoding: Text encoding used to decode the file.

    Returns:
        The parsed DataFrame. Malformed lines are skipped instead of raising
        (``on_bad_lines='skip'``).
    """
    options = {"encoding": encoding, "sep": ",", "on_bad_lines": "skip"}
    return pd.read_csv(csv, **options)

In [108]:
# Load the training set, decoding it with the encoding detected for this file earlier in the notebook.
df = read_csv('/content/train.csv', encoding='Windows-1252')

In [109]:
class DataError(Exception):
    """Error raised by this notebook's data helpers when a DataFrame operation fails."""

In [114]:
class DataReviewer:
    """Read-only inspection helpers for a pandas DataFrame.

    Each method delegates to the matching DataFrame operation and converts
    any failure into a ``DataError`` that is chained to the original
    exception and whose message names the operation that actually failed.

    Attributes:
        df: The DataFrame being inspected.
        data_cleaner: A ``DataCleaner`` built from the same DataFrame.
    """

    def __init__(self, df):
        self.df = df
        self.data_cleaner = DataCleaner(df)

    def _require_column(self, column):
        """Raise ``DataError`` unless ``column`` is one of the DataFrame's columns."""
        if column not in self.df.columns:
            raise DataError(f"Column '{column}' not found in dataset.")

    def show_sample(self, n=5):
        """Return ``n`` randomly sampled rows.

        Raises:
            DataError: If sampling fails (e.g. ``n`` exceeds the row count).
        """
        try:
            return self.df.sample(n)
        except Exception as e:
            raise DataError(f"Error showing sample: {e}") from e

    def get_info(self):
        """Print the DataFrame summary produced by ``DataFrame.info``.

        Returns:
            Whatever ``DataFrame.info()`` returns (``None``; the summary is
            printed, not returned).

        Raises:
            DataError: If the summary cannot be produced.
        """
        try:
            return self.df.info()
        except Exception as e:
            raise DataError(f"Error getting info: {e}") from e

    def describe(self):
        """Return the descriptive statistics from ``DataFrame.describe``.

        Raises:
            DataError: If the statistics cannot be computed.
        """
        try:
            return self.df.describe()
        except Exception as e:
            raise DataError(f"Error describing data: {e}") from e

    def get_columns(self):
        """Return the DataFrame's column index.

        Raises:
            DataError: If the columns cannot be read.
        """
        try:
            return self.df.columns
        except Exception as e:
            raise DataError(f"Error getting columns: {e}") from e

    def count_rows(self, column):
        """Return the number of non-null values in ``column``.

        Raises:
            DataError: If ``column`` does not exist or counting fails.
        """
        try:
            self._require_column(column)
            return self.df[column].count()
        except DataError:
            # Already carries a precise message; do not wrap it a second time.
            raise
        except Exception as e:
            raise DataError(f"Error counting rows: {e}") from e

    def get_unique(self, column):
        """Return the distinct values found in ``column``.

        Raises:
            DataError: If ``column`` does not exist or the lookup fails.
        """
        try:
            self._require_column(column)
            return self.df[column].unique()
        except DataError:
            raise
        except Exception as e:
            raise DataError(f"Error getting unique values: {e}") from e

    def check_nulls(self):
        """Return, per column, whether it contains at least one null value.

        Raises:
            DataError: If the check fails.
        """
        try:
            return self.df.isnull().any()
        except Exception as e:
            raise DataError(f"Error checking nulls: {e}") from e

    def get_percentage_nulls(self):
        """Return, per column, the percentage of rows that are null.

        Raises:
            DataError: If the percentages cannot be computed.
        """
        try:
            return self.df.isnull().sum() / len(self.df) * 100
        except Exception as e:
            raise DataError(f"Error computing null percentages: {e}") from e

    def get_dtypes(self):
        """Return the dtype of every column.

        Raises:
            DataError: If the dtypes cannot be read.
        """
        try:
            return self.df.dtypes
        except Exception as e:
            raise DataError(f"Error getting dtypes: {e}") from e

    def shape(self):
        """Return the ``(rows, columns)`` tuple of the DataFrame.

        Raises:
            DataError: If the shape cannot be read.
        """
        try:
            return self.df.shape
        except Exception as e:
            raise DataError(f"Error getting shape: {e}") from e

    def count_unique(self, column):
        """Return the number of distinct non-null values in ``column``.

        Raises:
            DataError: If ``column`` does not exist or counting fails.
        """
        try:
            self._require_column(column)
            return self.df[column].nunique()
        except DataError:
            raise
        except Exception as e:
            raise DataError(f"Error counting unique values: {e}") from e

In [128]:

class DataCleaner:
    """Column-level clean-up steps for the tweets DataFrame.

    Every method rebinds ``self.df`` to a new, cleaned DataFrame and returns
    it; the DataFrame passed to the constructor is not modified in place.
    Each step uses the notebook's original column names by default and
    accepts an optional override.

    Attributes:
        df: The current (most recently cleaned) DataFrame.
    """

    # Columns dropped by default; the last name keeps the trailing ';' run
    # exactly as it appears in the loaded file's header.
    UNNECESSARY_COLUMNS = ('textID', 'selected_text', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²);;;;;;;;;;;;')
    # Columns that must be non-null for a row to be kept.
    REQUIRED_COLUMNS = ("sentiment", "Time of Tweet", "Age of User", "Country")
    # Old name -> new name.
    COLUMN_RENAMES = {"Time of Tweet": "Time", "Age of User": "Age"}

    def __init__(self, df):
        self.df = df

    @staticmethod
    def _as_list(columns, default):
        """Return ``columns`` as a list, falling back to ``default`` when it is None."""
        if columns is None:
            return list(default)
        if isinstance(columns, str):
            # A bare string would otherwise be split into single characters.
            return [columns]
        return list(columns)

    def drop_unnecessary_col(self, columns=None):
        """Drop columns that are not needed for the analysis.

        Args:
            columns: Column names to drop. Defaults to ``UNNECESSARY_COLUMNS``.
                Names that are not present are skipped (``errors='ignore'``).

        Returns:
            The DataFrame without the listed columns.

        Raises:
            DataError: If pandas reports a ``KeyError`` while dropping.
        """
        to_drop = self._as_list(columns, self.UNNECESSARY_COLUMNS)
        try:
            self.df = self.df.drop(columns=to_drop, errors='ignore')
        except KeyError as e:
            raise DataError(f"Error dropping columns: {e}") from e
        return self.df

    def remove_nulls(self, columns=None):
        """Drop rows that have a null in any of the checked columns.

        Args:
            columns: Column names to check. Defaults to ``REQUIRED_COLUMNS``.

        Returns:
            The DataFrame without the rows that had nulls.

        Raises:
            DataError: If one of the checked columns does not exist.
        """
        to_check = self._as_list(columns, self.REQUIRED_COLUMNS)
        try:
            self.df = self.df.dropna(subset=to_check)
        except KeyError as e:
            raise DataError(f"Error dropping nulls: {e}") from e
        return self.df

    def rename_col(self, mapping=None):
        """Rename columns to shorter names.

        Args:
            mapping: ``{old_name: new_name}`` dictionary. Defaults to
                ``COLUMN_RENAMES`` ("Time of Tweet" -> "Time",
                "Age of User" -> "Age").

        Returns:
            The DataFrame with the renamed columns.

        Raises:
            DataError: If pandas reports a ``KeyError`` while renaming.
        """
        renames = dict(self.COLUMN_RENAMES if mapping is None else mapping)
        try:
            self.df = self.df.rename(columns=renames)
        except KeyError as e:
            raise DataError(f"Error renaming columns: {e}") from e
        return self.df

In [98]:
class WrongDataRemover():
    """Filter out rows whose categorical values are not in the accepted sets.

    Every method rebinds ``self.df`` to the resulting DataFrame and returns
    it. The age and time filters use the original column names
    ('Age of User', 'Time of Tweet'), so they must run before those columns
    are renamed.

    Attributes:
        df: The current (most recently filtered) DataFrame.
    """

    VALID_AGE_RANGES = ("0-20", "21-30", "31-45", "46-60", "60-70", "70-100")
    VALID_TIMES = ("noon", "night", "morning")
    VALID_SENTIMENTS = ("positive", "negative", "neutral")
    # The last entry is single-quoted on purpose: it reproduces the doubled
    # double-quote characters exactly as they appear in the loaded data.
    VALID_COUNTRIES = (
        'Albania', 'Algeria', 'Andorra', 'Antigua and Barbuda',
        'Argentina', 'Armenia', 'Australia', 'Austria', 'Bahamas',
        'Bahrain', 'Bangladesh', 'Barbados', 'Benin', 'Bhutan',
        'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei',
        'Burkina Faso', 'Burundi', "Côte d'Ivoire", 'Cabo Verde',
        'Cambodia', 'Cameroon', 'Canada', 'Central African Republic',
        'China', 'Comoros', 'Croatia', 'Cuba', 'Czechia (Czech Republic)',
        'Democratic Republic of the Congo', 'Denmark', 'Djibouti',
        'Dominican Republic', 'Ecuador', 'El Salvador',
        'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Fiji',
        'Finland', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana',
        'Greece', 'Grenada', 'Guatemala', 'Guinea-Bissau', 'Guyana',
        'Honduras', 'Hungary', 'India', 'Indonesia', 'Iraq', 'Ireland',
        'Israel', 'Italy', 'Japan', 'Jordan', 'Kenya', 'Kiribati',
        'Kuwait', 'Kyrgyzstan', 'Laos', 'Latvia', 'Lebanon', 'Lesotho',
        'Libya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Malaysia',
        'Maldives', 'Mali', 'Malta', 'Marshall Islands', 'Mauritania',
        'Mexico', 'Micronesia', 'Monaco', 'Montenegro', 'Morocco',
        'Mozambique', 'Myanmar (formerly Burma)', 'Nepal', 'Nicaragua',
        'Niger', 'North Korea', 'North Macedonia', 'Norway', 'Pakistan',
        'Palau', 'Palestine State', 'Panama', 'Papua New Guinea',
        'Paraguay', 'Philippines', 'Portugal', 'Rwanda', 'Saint Lucia',
        'Saint Vincent and the Grenadines', 'Samoa',
        'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia',
        'Seychelles', 'Sierra Leone', 'Singapore', 'Slovenia', 'Somalia',
        'South Korea', 'South Sudan', 'Spain', 'Sri Lanka', 'Sudan',
        'Suriname', 'Sweden', 'Switzerland', 'Syria', 'Tajikistan',
        'Tanzania', 'Thailand', 'Timor-Leste', 'Togo', 'Tonga',
        'Trinidad and Tobago', 'Tunisia', 'Turkmenistan', 'Tuvalu',
        'Uganda', 'Ukraine', 'United Arab Emirates', 'United Kingdom',
        'United States of America', 'Uruguay', 'Uzbekistan', 'Vanuatu',
        'Venezuela', 'Yemen', 'Zambia', 'Zimbabwe', 'Afghanistan',
        'Angola', 'Azerbaijan', 'Belgium', 'Belize', 'Bolivia', 'Bulgaria',
        'Chad', 'Chile', 'Costa Rica', 'Cyprus', 'Dominica', 'Egypt',
        'France', 'Haiti', 'Iceland', 'Jamaica', 'Madagascar', 'Malawi',
        'Namibia', 'Nauru', 'Netherlands', 'New Zealand', 'Nigeria',
        'Oman', 'Poland', 'Qatar', 'Russia', 'Saint Kitts and Nevis',
        'San Marino', 'Slovakia', 'Solomon Islands', 'South Africa',
        'Turkey', 'Vietnam', 'Belarus', 'Colombia',
        'Congo (Congo-Brazzaville)', 'Guinea', 'Holy See', 'Kazakhstan',
        'Moldova', 'Mongolia', 'Peru', 'Romania', 'Iran', 'Mauritius', 'Liberia', 'Eswatini (fmr. """"Swaziland"""")""',
    )
    # Long or malformed country names -> short display names. The Eswatini key
    # must be the same single-quoted literal used in VALID_COUNTRIES: written
    # with double quotes, Python splits it into adjacent string literals that
    # concatenate to a different text, so the replacement never matches.
    COUNTRY_RENAMES = {
        "Czechia (Czech Republic)": "Czechia",
        "Democratic Republic of the Congo": "R.D. Congo",
        "Myanmar (formerly Burma)": "Myanmar",
        "Congo (Congo-Brazzaville)": "Congo-Brazzaville",
        'Eswatini (fmr. """"Swaziland"""")""': "Eswatini",
    }

    def __init__(self, df):
        self.df = df

    def _keep_only(self, column, allowed):
        """Keep the rows whose ``column`` value is one of ``allowed``."""
        self.df = self.df[self.df[column].isin(list(allowed))]
        return self.df

    def remove_wrong_age(self):
        """Keep only rows whose 'Age of User' is one of the known age ranges."""
        return self._keep_only('Age of User', self.VALID_AGE_RANGES)

    def remove_wrong_time(self):
        """Keep only rows whose 'Time of Tweet' is noon, night or morning."""
        return self._keep_only('Time of Tweet', self.VALID_TIMES)

    def remove_wrong_countries(self):
        """Keep only rows whose 'Country' is in the accepted country list."""
        return self._keep_only('Country', self.VALID_COUNTRIES)

    def change_country_names(self):
        """Replace long country names with their short display names.

        Returns:
            The DataFrame with the 'Country' column rewritten according to
            ``COUNTRY_RENAMES``; other values are left untouched.
        """
        # assign() builds a new frame, so a filtered slice held in self.df is
        # never written to in place.
        renamed = self.df['Country'].replace(self.COUNTRY_RENAMES)
        self.df = self.df.assign(Country=renamed)
        return self.df

    def remove_wrong_sentiment(self):
        """Keep only rows whose 'sentiment' is positive, negative or neutral."""
        return self._keep_only('sentiment', self.VALID_SENTIMENTS)


In [105]:
from abc import ABC, abstractmethod
import matplotlib.pyplot as plt

class DataVisualizer(ABC):
    """Abstract interface for sentiment analysers built on a DataFrame.

    Concrete subclasses (``CountryAnalyzer``, ``AgeAnalyzer``) fix one
    grouping value and implement the counting and plotting methods below.

    Attributes:
        df: The DataFrame the analyser reads from.
    """

    def __init__(self, df):
        self.df = df

    @abstractmethod
    def get_positive(self):
        """Return the number of positive tweets for the analysed group."""

    @abstractmethod
    def get_negative(self):
        """Return the number of negative tweets for the analysed group."""

    @abstractmethod
    def get_sentiment(self):
        """Plot the sentiment distribution for the analysed group."""

    @abstractmethod
    def compare_sentiment(self, sentiment="negative"):
        """Plot which groups have the most tweets with the given sentiment.

        Args:
            sentiment: Sentiment label to compare across groups. The default
                matches the one used by the concrete implementations.
        """

class CountryAnalyzer(DataVisualizer):
    """Sentiment counts and charts for one country.

    Reads the 'Country' and 'sentiment' columns of the DataFrame. Failures
    are reported with ``print`` and the affected method returns ``None``
    (best-effort behaviour, suited to interactive notebook use).

    Attributes:
        df: The DataFrame the analyser reads from.
        country: The country value this analyser is focused on.
    """

    def __init__(self, df, country):
        super().__init__(df)
        self.country = country

    def _count_sentiment(self, sentiment):
        """Return how many rows match this country and the given sentiment."""
        selected = self.df[(self.df['Country'] == self.country) & (self.df['sentiment'] == sentiment)]
        return selected.shape[0]

    def get_positive(self):
        """Return the number of positive tweets for this country, or None on error."""
        try:
            return self._count_sentiment("positive")
        except Exception as e:
            print(f"Error getting positive sentiment for {self.country}: {e}")
            return None

    def get_negative(self):
        """Return the number of negative tweets for this country, or None on error."""
        try:
            return self._count_sentiment("negative")
        except Exception as e:
            print(f"Error getting negative sentiment for {self.country}: {e}")
            return None

    def get_sentiment(self):
        """Show a bar chart of tweet counts per sentiment for this country."""
        try:
            sentiment_counts = self.df[self.df['Country'] == self.country]['sentiment'].value_counts()
            if sentiment_counts.empty:
                # Plotting an empty Series fails with an unhelpful message.
                print(f"No tweets found for {self.country}")
                return
            sentiment_counts.plot(kind='bar', figsize=(10, 6))
            plt.title(f'Tweets in {self.country}')
            plt.xlabel('Sentiment')
            plt.ylabel('Number of Tweets')
            plt.show()
        except Exception as e:
            print(f"Error plotting sentiment for {self.country}: {e}")

    def compare_sentiment(self, sentiment="negative"):
        """Show the ten countries with the most tweets of the given sentiment.

        Args:
            sentiment: Sentiment label to filter on before counting.
        """
        try:
            tweets_by_country = self.df[self.df['sentiment'] == sentiment]['Country'].value_counts()
            # value_counts() is already ordered by descending frequency.
            top_countries = tweets_by_country.head(10)
            if top_countries.empty:
                print(f"No tweets found with sentiment '{sentiment}'")
                return

            plt.figure(figsize=(12, 8))
            colors = plt.cm.tab20(range(len(top_countries)))
            plt.bar(top_countries.index, top_countries.values, color=colors)
            plt.legend(top_countries.index, loc='upper right')
            plt.title(f'Countries with the most {sentiment} tweets')
            plt.xlabel('Country')
            plt.ylabel('Number of Tweets')
            plt.xticks(rotation=45, ha='right', fontsize=7)
            # Make room for the rotated country names so they are not clipped.
            plt.tight_layout()
            plt.show()
        except KeyError as e:
            print(f"KeyError: {e}. Make sure 'sentiment' and 'Country' columns exist in the DataFrame.")
        except Exception as e:
            print(f"Error comparing sentiment: {e}")

class AgeAnalyzer(DataVisualizer):
    """Sentiment counts and charts for one age group.

    Reads the 'Age' and 'sentiment' columns of the DataFrame, i.e. it
    expects the age column to have been renamed to 'Age' already. Failures
    are reported with ``print`` and the affected method returns ``None``
    (best-effort behaviour, suited to interactive notebook use).

    Attributes:
        df: The DataFrame the analyser reads from.
        age: The age value this analyser is focused on.
    """

    def __init__(self, df, age):
        super().__init__(df)
        self.age = age

    def _count_sentiment(self, sentiment):
        """Return how many rows match this age group and the given sentiment."""
        selected = self.df[(self.df['Age'] == self.age) & (self.df['sentiment'] == sentiment)]
        return selected.shape[0]

    def get_positive(self):
        """Return the number of positive tweets for this age group, or None on error."""
        try:
            return self._count_sentiment("positive")
        except Exception as e:
            print(f"Error getting positive sentiment for {self.age} years old: {e}")
            return None

    def get_negative(self):
        """Return the number of negative tweets for this age group, or None on error."""
        try:
            return self._count_sentiment("negative")
        except Exception as e:
            print(f"Error getting negative sentiment for {self.age} years old: {e}")
            return None

    def get_sentiment(self):
        """Show a bar chart of tweet counts per sentiment for this age group."""
        try:
            sentiment_counts = self.df[self.df['Age'] == self.age]['sentiment'].value_counts()
            if sentiment_counts.empty:
                # Plotting an empty Series fails with an unhelpful message.
                print(f"No tweets found for people of {self.age} years old")
                return
            sentiment_counts.plot(kind='bar', figsize=(10, 6))
            plt.title(f'Tweets by people of {self.age} years old')
            plt.xlabel('Sentiment')
            plt.ylabel('Number of Tweets')
            plt.show()
        except Exception as e:
            print(f"Error plotting sentiment for {self.age} years old: {e}")

    def compare_sentiment(self, sentiment="negative"):
        """Show the (up to ten) age groups with the most tweets of the given sentiment.

        Args:
            sentiment: Sentiment label to filter on before counting.
        """
        try:
            tweets_by_age = self.df[self.df['sentiment'] == sentiment]['Age'].value_counts()
            # value_counts() is already ordered by descending frequency.
            top_ages = tweets_by_age.head(10)
            if top_ages.empty:
                print(f"No tweets found with sentiment '{sentiment}'")
                return

            plt.figure(figsize=(12, 8))
            colors = plt.cm.tab20(range(len(top_ages)))
            plt.bar(top_ages.index, top_ages.values, color=colors)
            plt.legend(top_ages.index, loc='upper right')
            plt.title(f'Age groups with the most {sentiment} tweets')
            plt.xlabel('Age')
            plt.ylabel('Number of Tweets')
            plt.xticks(rotation=45, ha='right', fontsize=7)
            # Make room for the rotated tick labels so they are not clipped.
            plt.tight_layout()
            plt.show()
        except KeyError as e:
            print(f"KeyError: {e}. Make sure 'sentiment' and 'Age' columns exist in the DataFrame.")
        except Exception as e:
            print(f"Error comparing sentiment: {e}")


Solucionar los nombres de países solapados en las gráficas. Asegurarme de que el código de matplotlib esté bien. Hacer otro tipo de gráficas para comparar países. Hacer otra clase que se base en la clase abstracta pero que sirva para comparar edades.
----
Usar excepciones personalizadas
