# Text as Data

## Libraries

In [7]:
# Importing necessary libraries
import os
import re  # Regular expressions (used by clean_text)

import pandas as pd
import numpy as np

# DNP Contexto
from contexto.limpieza import *  # Custom cleaning methods (assumes a specific library)

# Text preprocessing and categorical encoding
from sklearn.feature_extraction.text import CountVectorizer  # Bag of Words model
from sklearn.preprocessing import LabelEncoder  # Encoding categorical variables

# Natural Language Processing
import spacy  # Lemmatization

# Dimensionality reduction
from sklearn.decomposition import PCA  # Principal Component Analysis
from sklearn.preprocessing import StandardScaler  # Standardizing features

# Model evaluation and training
import xgboost as xgb # XGBoost model
from sklearn.model_selection import train_test_split  # Splitting datasets
from sklearn.linear_model import LinearRegression, ElasticNet  # Regression models
from sklearn.metrics import mean_absolute_error  # Performance evaluation metric

# Graphs
import matplotlib.pyplot as plt

## Data Loading

In [8]:
# Directory
# Defaults to the original author's project folder. Set the
# PROBLEM_SET_3_DIR environment variable to run the notebook on another
# machine without editing this cell.
PROJECT_DIR = os.environ.get(
    "PROBLEM_SET_3_DIR",
    "/Users/sergiosandovalcamargo/Desktop/Problem Set 3",
)
# The working directory is switched (rather than prefixing each path) because
# the files below and in later cells are opened with paths relative to it.
os.chdir(PROJECT_DIR)

# Train and test data
train = pd.read_csv("stores/data/raw/train_test/train.csv")
test = pd.read_csv("stores/data/raw/train_test/test.csv")

In [1]:
# Combined data
# Requires `train` and `test` from the data-loading cell; running this cell
# before that one raises NameError.
# Flag each row's origin (0 = train, 1 = test) so the stacked frame can be
# split back apart later.
train['is_test'], test['is_test'] = 0, 1
data = pd.concat(objs=[train, test], ignore_index=True)

NameError: name 'train' is not defined

## Clean Text

### Normalization, Stop Words and Lemmatization

In [10]:
# Base Spanish stop-word list, read one entry per line with surrounding
# whitespace stripped (608 entries according to the original note).
with open('stores/data/raw/external/spanish.txt', 'r', encoding='utf-8') as file:
    stopwords = set(map(str.strip, file))

# Additional stop words, written without accents.
additional_stopwords = [
    'vendo', 'venta', 'vende', 'etc', 'carrera', 'calle', 'casa', 'apto',
    'apartamento', 'propiedad', 'venta', 'inmueble', 'cuarto', 'habitacion',
    'excelente', 'ubicado', 'area', 'espectacular', 'magnifico', 'muy',
    'vivienda', 'piso', 'alcoba', 'bano', 'bao', 'via', 'mas', 'consta',
    'bogota', 'santa', 'mts', 'metro',
]

# Merge the extras into the base set in place (duplicates collapse).
stopwords |= set(additional_stopwords)

In [11]:
# Spacy model: loads the 'es_core_news_sm' pipeline with the parser and NER
# components disabled; clean_text only reads token lemmas from its output.
nlp = spacy.load('es_core_news_sm', disable=['parser', 'ner'])

# Clean text function
def clean_text(text):
    """Turn one raw description into a cleaned, lemmatized string.

    Steps, in order: a first ``limpieza_texto`` pass, lemmatization with the
    module-level spaCy pipeline ``nlp``, removal of a trailing "s" from every
    word, and a second ``limpieza_texto`` pass with the same arguments.

    Relies on the module-level names ``stopwords``, ``nlp`` and
    ``limpieza_texto``, and on the explicit top-of-notebook ``import re``.

    Args:
        text: Raw description; may be missing (``None`` / ``NaN``).

    Returns:
        The cleaned text, or ``""`` when ``text`` is null.
    """
    # Handle null values: missing descriptions map to an empty string.
    if pd.isnull(text):
        return ""

    # First cleaning filter.
    # NOTE(review): presumably strips accents, drops words shorter than
    # n_min characters and removes the stop words -- confirm against the
    # ConTexto documentation.
    text_clean = limpieza_texto(text, quitar_acentos=True, n_min=3, lista_palabras=stopwords)

    # Lemmatization: rebuild the text from each token's lemma.
    doc = nlp(text_clean)
    text_clean = ' '.join(token.lemma_ for token in doc)

    # Remove plural 's'.
    # NOTE(review): the pattern drops the final "s" of every word that ends
    # in "s", not only plurals -- confirm this is intended.
    text_clean = re.sub(r'\b(\w+)s\b', r'\1', text_clean)

    # Second cleaning filter, same arguments as the first; presumably catches
    # stop words or short tokens produced by the two steps above.
    text_clean = limpieza_texto(text_clean, quitar_acentos=True, n_min=3, lista_palabras=stopwords)

    return text_clean

In [12]:
# Apply function
# Runs clean_text on every description and stores the result as a new column.
data['cleaned_text'] = data['description'].map(clean_text)